No OneTemporary
Actions

Size

3 MB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: head/contrib/compiler-rt
	===================================================================
	--- head/contrib/compiler-rt (revision 328816)
	+++ head/contrib/compiler-rt (revision 328817)

	Property changes on: head/contrib/compiler-rt
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/compiler-rt/dist-release_60:r328751-328794
	Index: head/contrib/libc++
	===================================================================
	--- head/contrib/libc++ (revision 328816)
	+++ head/contrib/libc++ (revision 328817)

	Property changes on: head/contrib/libc++
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/libc++/dist-release_60:r328751-328794
	Index: head/contrib/llvm/include/llvm/CodeGen/Passes.h
	===================================================================
	--- head/contrib/llvm/include/llvm/CodeGen/Passes.h (revision 328816)
	+++ head/contrib/llvm/include/llvm/CodeGen/Passes.h (revision 328817)
	@@ -1,422 +1,425 @@
	//===-- Passes.h - Target independent code generation passes ----*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines interfaces to access the target independent code generation
	// passes provided by the LLVM backend.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_PASSES_H
	#define LLVM_CODEGEN_PASSES_H

	#include <functional>
	#include <string>

	namespace llvm {

	class FunctionPass;
	class MachineFunction;
	class MachineFunctionPass;
	class ModulePass;
	class Pass;
	class TargetMachine;
	class TargetRegisterClass;
	class raw_ostream;

	} // End llvm namespace

	/// List of target independent CodeGen pass IDs.
	namespace llvm {
	FunctionPass *createAtomicExpandPass();

	/// createUnreachableBlockEliminationPass - The LLVM code generator does not
	/// work well with unreachable basic blocks (what live ranges make sense for a
	/// block that cannot be reached?). As such, a code generator should either
	/// not instruction select unreachable blocks, or run this pass as its
	/// last LLVM modifying pass to clean up blocks that are not reachable from
	/// the entry block.
	FunctionPass *createUnreachableBlockEliminationPass();

	/// MachineFunctionPrinter pass - This pass prints out the machine function to
	/// the given stream as a debugging tool.
	MachineFunctionPass *
	createMachineFunctionPrinterPass(raw_ostream &OS,
	const std::string &Banner ="");

	/// MIRPrinting pass - this pass prints out the LLVM IR into the given stream
	/// using the MIR serialization format.
	MachineFunctionPass *createPrintMIRPass(raw_ostream &OS);

	/// This pass resets a MachineFunction when it has the FailedISel property
	/// as if it was just created.
	/// If EmitFallbackDiag is true, the pass will emit a
	/// DiagnosticInfoISelFallback for every MachineFunction it resets.
	/// If AbortOnFailedISel is true, abort compilation instead of resetting.
	MachineFunctionPass *createResetMachineFunctionPass(bool EmitFallbackDiag,
	bool AbortOnFailedISel);

	/// createCodeGenPreparePass - Transform the code to expose more pattern
	/// matching during instruction selection.
	FunctionPass *createCodeGenPreparePass();

	/// createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather
	/// and scatter intrinsics with scalar code when target doesn't support them.
	FunctionPass *createScalarizeMaskedMemIntrinPass();

	/// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
	/// or load-linked/store-conditional loops.
	extern char &AtomicExpandID;

	/// MachineLoopInfo - This pass is a loop analysis pass.
	extern char &MachineLoopInfoID;

	/// MachineDominators - This pass is a machine dominators analysis pass.
	extern char &MachineDominatorsID;

	/// MachineDominanceFrontier - This pass is a machine dominance frontier
	/// analysis pass.
	extern char &MachineDominanceFrontierID;

	/// MachineRegionInfo - This pass computes SESE regions for machine functions.
	extern char &MachineRegionInfoPassID;

	/// EdgeBundles analysis - Bundle machine CFG edges.
	extern char &EdgeBundlesID;

	/// LiveVariables pass - This pass computes the set of blocks in which each
	/// variable is live and sets machine operand kill flags.
	extern char &LiveVariablesID;

	/// PHIElimination - This pass eliminates machine instruction PHI nodes
	/// by inserting copy instructions. This destroys SSA information, but is the
	/// desired input for some register allocators. This pass is "required" by
	/// these register allocator like this: AU.addRequiredID(PHIEliminationID);
	extern char &PHIEliminationID;

	/// LiveIntervals - This analysis keeps track of the live ranges of virtual
	/// and physical registers.
	extern char &LiveIntervalsID;

	/// LiveStacks pass. An analysis keeping track of the liveness of stack slots.
	extern char &LiveStacksID;

	/// TwoAddressInstruction - This pass reduces two-address instructions to
	/// use two operands. This destroys SSA information but it is desired by
	/// register allocators.
	extern char &TwoAddressInstructionPassID;

	/// ProcessImplicitDefs pass - This pass removes IMPLICIT_DEFs.
	extern char &ProcessImplicitDefsID;

	/// RegisterCoalescer - This pass merges live ranges to eliminate copies.
	extern char &RegisterCoalescerID;

	/// MachineScheduler - This pass schedules machine instructions.
	extern char &MachineSchedulerID;

	/// PostMachineScheduler - This pass schedules machine instructions postRA.
	extern char &PostMachineSchedulerID;

	/// SpillPlacement analysis. Suggest optimal placement of spill code between
	/// basic blocks.
	extern char &SpillPlacementID;

	/// ShrinkWrap pass. Look for the best place to insert save and restore
	/// instructions and update the MachineFunctionInfo with that information.
	extern char &ShrinkWrapID;

	/// LiveRangeShrink pass. Move instruction close to its definition to shrink
	/// the definition's live range.
	extern char &LiveRangeShrinkID;

	/// Greedy register allocator.
	extern char &RAGreedyID;

	/// Basic register allocator.
	extern char &RABasicID;

	/// VirtRegRewriter pass. Rewrite virtual registers to physical registers as
	/// assigned in VirtRegMap.
	extern char &VirtRegRewriterID;

	/// UnreachableMachineBlockElimination - This pass removes unreachable
	/// machine basic blocks.
	extern char &UnreachableMachineBlockElimID;

	/// DeadMachineInstructionElim - This pass removes dead machine instructions.
	extern char &DeadMachineInstructionElimID;

	/// This pass adds dead/undef flags after analyzing subregister lanes.
	extern char &DetectDeadLanesID;

	/// FastRegisterAllocation Pass - This pass register allocates as fast as
	/// possible. It is best suited for debug code where live ranges are short.
	///
	FunctionPass *createFastRegisterAllocator();

	/// BasicRegisterAllocation Pass - This pass implements a degenerate global
	/// register allocator using the basic regalloc framework.
	///
	FunctionPass *createBasicRegisterAllocator();

	/// Greedy register allocation pass - This pass implements a global register
	/// allocator for optimized builds.
	///
	FunctionPass *createGreedyRegisterAllocator();

	/// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean
	/// Quadratic Programming (PBQP) based register allocator.
	///
	FunctionPass *createDefaultPBQPRegisterAllocator();

	/// PrologEpilogCodeInserter - This pass inserts prolog and epilog code,
	/// and eliminates abstract frame references.
	extern char &PrologEpilogCodeInserterID;
	MachineFunctionPass *createPrologEpilogInserterPass();

	/// ExpandPostRAPseudos - This pass expands pseudo instructions after
	/// register allocation.
	extern char &ExpandPostRAPseudosID;

	/// createPostRAHazardRecognizer - This pass runs the post-ra hazard
	/// recognizer.
	extern char &PostRAHazardRecognizerID;

	/// createPostRAScheduler - This pass performs post register allocation
	/// scheduling.
	extern char &PostRASchedulerID;

	/// BranchFolding - This pass performs machine code CFG based
	/// optimizations to delete branches to branches, eliminate branches to
	/// successor blocks (creating fall throughs), and eliminating branches over
	/// branches.
	extern char &BranchFolderPassID;

	/// BranchRelaxation - This pass replaces branches that need to jump further
	/// than is supported by a branch instruction.
	extern char &BranchRelaxationPassID;

	/// MachineFunctionPrinterPass - This pass prints out MachineInstr's.
	extern char &MachineFunctionPrinterPassID;

	/// MIRPrintingPass - this pass prints out the LLVM IR using the MIR
	/// serialization format.
	extern char &MIRPrintingPassID;

	/// TailDuplicate - Duplicate blocks with unconditional branches
	/// into tails of their predecessors.
	extern char &TailDuplicateID;

	/// MachineTraceMetrics - This pass computes critical path and CPU resource
	/// usage in an ensemble of traces.
	extern char &MachineTraceMetricsID;

	/// EarlyIfConverter - This pass performs if-conversion on SSA form by
	/// inserting cmov instructions.
	extern char &EarlyIfConverterID;

	/// This pass performs instruction combining using trace metrics to estimate
	/// critical-path and resource depth.
	extern char &MachineCombinerID;

	/// StackColoring - This pass performs stack coloring and merging.
	/// It merges disjoint allocas to reduce the stack size.
	extern char &StackColoringID;

	/// IfConverter - This pass performs machine code if conversion.
	extern char &IfConverterID;

	FunctionPass *createIfConverter(
	std::function<bool(const MachineFunction &)> Ftor);

	/// MachineBlockPlacement - This pass places basic blocks based on branch
	/// probabilities.
	extern char &MachineBlockPlacementID;

	/// MachineBlockPlacementStats - This pass collects statistics about the
	/// basic block placement using branch probabilities and block frequency
	/// information.
	extern char &MachineBlockPlacementStatsID;

	/// GCLowering Pass - Used by gc.root to perform its default lowering
	/// operations.
	FunctionPass *createGCLoweringPass();

	/// ShadowStackGCLowering - Implements the custom lowering mechanism
	/// used by the shadow stack GC. Only runs on functions which opt in to
	/// the shadow stack collector.
	FunctionPass *createShadowStackGCLoweringPass();

	/// GCMachineCodeAnalysis - Target-independent pass to mark safe points
	/// in machine code. Must be added very late during code generation, just
	/// prior to output, and importantly after all CFG transformations (such as
	/// branch folding).
	extern char &GCMachineCodeAnalysisID;

	/// Creates a pass to print GC metadata.
	///
	FunctionPass *createGCInfoPrinter(raw_ostream &OS);

	/// MachineCSE - This pass performs global CSE on machine instructions.
	extern char &MachineCSEID;

	/// ImplicitNullChecks - This pass folds null pointer checks into nearby
	/// memory operations.
	extern char &ImplicitNullChecksID;

	/// MachineLICM - This pass performs LICM on machine instructions.
	extern char &MachineLICMID;

	/// MachineSinking - This pass performs sinking on machine instructions.
	extern char &MachineSinkingID;

	/// MachineCopyPropagation - This pass performs copy propagation on
	/// machine instructions.
	extern char &MachineCopyPropagationID;

	/// PeepholeOptimizer - This pass performs peephole optimizations -
	/// like extension and comparison eliminations.
	extern char &PeepholeOptimizerID;

	/// OptimizePHIs - This pass optimizes machine instruction PHIs
	/// to take advantage of opportunities created during DAG legalization.
	extern char &OptimizePHIsID;

	/// StackSlotColoring - This pass performs stack slot coloring.
	extern char &StackSlotColoringID;

	/// \brief This pass lays out funclets contiguously.
	extern char &FuncletLayoutID;

	/// This pass inserts the XRay instrumentation sleds if they are supported by
	/// the target platform.
	extern char &XRayInstrumentationID;

	/// This pass inserts FEntry calls
	extern char &FEntryInserterID;

	/// \brief This pass implements the "patchable-function" attribute.
	extern char &PatchableFunctionID;

	/// createStackProtectorPass - This pass adds stack protectors to functions.
	///
	FunctionPass *createStackProtectorPass();

	/// createMachineVerifierPass - This pass verifies generated machine code
	/// instructions for correctness.
	///
	FunctionPass *createMachineVerifierPass(const std::string& Banner);

	/// createDwarfEHPass - This pass mulches exception handling code into a form
	/// adapted to code generation. Required if using dwarf exception handling.
	FunctionPass *createDwarfEHPass();

	/// createWinEHPass - Prepares personality functions used by MSVC on Windows,
	/// in addition to the Itanium LSDA based personalities.
	FunctionPass *createWinEHPass();

	/// createSjLjEHPreparePass - This pass adapts exception handling code to use
	/// the GCC-style builtin setjmp/longjmp (sjlj) to handling EH control flow.
	///
	FunctionPass *createSjLjEHPreparePass();

	/// LocalStackSlotAllocation - This pass assigns local frame indices to stack
	/// slots relative to one another and allocates base registers to access them
	/// when it is estimated by the target to be out of range of normal frame
	/// pointer or stack pointer index addressing.
	extern char &LocalStackSlotAllocationID;

	/// ExpandISelPseudos - This pass expands pseudo-instructions.
	extern char &ExpandISelPseudosID;

	/// UnpackMachineBundles - This pass unpack machine instruction bundles.
	extern char &UnpackMachineBundlesID;

	FunctionPass *
	createUnpackMachineBundles(std::function<bool(const MachineFunction &)> Ftor);

	/// FinalizeMachineBundles - This pass finalize machine instruction
	/// bundles (created earlier, e.g. during pre-RA scheduling).
	extern char &FinalizeMachineBundlesID;

	/// StackMapLiveness - This pass analyses the register live-out set of
	/// stackmap/patchpoint intrinsics and attaches the calculated information to
	/// the intrinsic for later emission to the StackMap.
	extern char &StackMapLivenessID;

	/// LiveDebugValues pass
	extern char &LiveDebugValuesID;

	/// createJumpInstrTables - This pass creates jump-instruction tables.
	ModulePass *createJumpInstrTablesPass();

	/// createForwardControlFlowIntegrityPass - This pass adds control-flow
	/// integrity.
	ModulePass *createForwardControlFlowIntegrityPass();

	/// InterleavedAccess Pass - This pass identifies and matches interleaved
	/// memory accesses to target specific intrinsics.
	///
	FunctionPass *createInterleavedAccessPass();

	/// LowerEmuTLS - This pass generates __emutls_[vt].xyz variables for all
	/// TLS variables for the emulated TLS model.
	///
	ModulePass *createLowerEmuTLSPass();

	/// This pass lowers the @llvm.load.relative intrinsic to instructions.
	/// This is unsafe to do earlier because a pass may combine the constant
	/// initializer into the load, which may result in an overflowing evaluation.
	ModulePass *createPreISelIntrinsicLoweringPass();

	/// GlobalMerge - This pass merges internal (by default) globals into structs
	/// to enable reuse of a base pointer by indexed addressing modes.
	/// It can also be configured to focus on size optimizations only.
	///
	/// The pass is returned by pointer and the target machine is taken by
	/// pointer, like the other create*Pass factories declared in this header
	/// (Pass and TargetMachine are only forward-declared here, so neither can
	/// sensibly be passed or returned by value).
	Pass *createGlobalMergePass(const TargetMachine *TM, unsigned MaximalOffset,
	                            bool OnlyOptimizeForSize = false,
	                            bool MergeExternalByDefault = false);

	/// This pass splits the stack into a safe stack and an unsafe stack to
	/// protect against stack-based overflow vulnerabilities.
	FunctionPass *createSafeStackPass();

	/// This pass detects subregister lanes in a virtual register that are used
	/// independently of other lanes and splits them into separate virtual
	/// registers.
	extern char &RenameIndependentSubregsID;

	/// This pass is executed POST-RA to collect which physical registers are
	/// preserved by given machine function.
	FunctionPass *createRegUsageInfoCollector();

	/// Return a MachineFunction pass that identifies call sites
	/// and propagates register usage information of callee to caller
	/// if available with PhysicalRegisterUsageInfo pass.
	FunctionPass *createRegUsageInfoPropPass();

	/// This pass performs software pipelining on machine instructions.
	extern char &MachinePipelinerID;

	/// This pass frees the memory occupied by the MachineFunction.
	FunctionPass *createFreeMachineFunctionPass();

	/// This pass performs outlining on machine instructions directly before
	/// printing assembly.
	ModulePass *createMachineOutlinerPass(bool OutlineFromLinkOnceODRs = false);

	/// This pass expands the experimental reduction intrinsics into sequences of
	/// shuffles.
	FunctionPass *createExpandReductionsPass();

	// This pass expands memcmp() to load/stores.
	FunctionPass *createExpandMemCmpPass();

	+ // This pass expands indirectbr instructions.
	+ FunctionPass *createIndirectBrExpandPass();
	+
	} // End llvm namespace

	#endif
	Index: head/contrib/llvm/include/llvm/CodeGen/TargetInstrInfo.h
	===================================================================
	--- head/contrib/llvm/include/llvm/CodeGen/TargetInstrInfo.h (revision 328816)
	+++ head/contrib/llvm/include/llvm/CodeGen/TargetInstrInfo.h (revision 328817)
	@@ -1,1691 +1,1695 @@
	//===- llvm/CodeGen/TargetInstrInfo.h - Instruction Info --------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the target machine instruction set to the code generator.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_TARGET_TARGETINSTRINFO_H
	#define LLVM_TARGET_TARGETINSTRINFO_H

	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseMapInfo.h"
	#include "llvm/ADT/None.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineCombinerPattern.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/PseudoSourceValue.h"
	#include "llvm/MC/MCInstrInfo.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Support/ErrorHandling.h"
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <utility>
	#include <vector>

	namespace llvm {

	class DFAPacketizer;
	class InstrItineraryData;
	class LiveIntervals;
	class LiveVariables;
	class MachineMemOperand;
	class MachineRegisterInfo;
	class MCAsmInfo;
	class MCInst;
	struct MCSchedModel;
	class Module;
	class ScheduleDAG;
	class ScheduleHazardRecognizer;
	class SDNode;
	class SelectionDAG;
	class RegScavenger;
	class TargetRegisterClass;
	class TargetRegisterInfo;
	class TargetSchedModel;
	class TargetSubtargetInfo;

	template <class T> class SmallVectorImpl;

	//---------------------------------------------------------------------------
	///
	/// TargetInstrInfo - Interface to description of machine instruction set
	///
	class TargetInstrInfo : public MCInstrInfo {
	public:
	/// Records the four opcodes passed in (call-frame setup, call-frame destroy,
	/// catch-return and return) in the members of the same meaning. Every
	/// argument defaults to ~0u.
	/// NOTE(review): ~0u presumably stands for "the target has no such
	/// instruction" -- confirm against the targets constructing this class.
	TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u,
	unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u)
	: CallFrameSetupOpcode(CFSetupOpcode),
	CallFrameDestroyOpcode(CFDestroyOpcode), CatchRetOpcode(CatchRetOpcode),
	ReturnOpcode(ReturnOpcode) {}
	TargetInstrInfo(const TargetInstrInfo &) = delete;
	TargetInstrInfo &operator=(const TargetInstrInfo &) = delete;
	virtual ~TargetInstrInfo();

	/// Tells whether \p Opc lies in the generic opcode range, i.e. whether it
	/// does not exceed TargetOpcode::GENERIC_OP_END.
	static bool isGenericOpcode(unsigned Opc) {
	  const bool PastGenericRange = Opc > TargetOpcode::GENERIC_OP_END;
	  return !PastGenericRange;
	}

	/// Given a machine instruction descriptor, returns the register
	/// class constraint for OpNum, or NULL.
	const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
	const TargetRegisterInfo *TRI,
	const MachineFunction &MF) const;

	/// Return true if the instruction is trivially rematerializable, meaning it
	/// has no side effects and requires no operands that aren't always available.
	/// This means the only allowed uses are constants and unallocatable physical
	/// registers so that the instructions result is independent of the place
	/// in the function.
	bool isTriviallyReMaterializable(const MachineInstr &MI,
	                                 AliasAnalysis *AA = nullptr) const {
	  // IMPLICIT_DEF qualifies unconditionally, whatever its descriptor says.
	  if (MI.getOpcode() == TargetOpcode::IMPLICIT_DEF)
	    return true;
	  // Anything else needs the rematerializable descriptor flag, plus a "yes"
	  // from either the target hook (asked first) or the generic check.
	  return MI.getDesc().isRematerializable() &&
	         (isReallyTriviallyReMaterializable(MI, AA) ||
	          isReallyTriviallyReMaterializableGeneric(MI, AA));
	}

	protected:
	/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
	/// set, this hook lets the target specify whether the instruction is actually
	/// trivially rematerializable, taking into consideration its operands. This
	/// predicate must return false if the instruction has any side effects other
	/// than producing a value, or if it requires any address registers that are
	/// not always available.
	/// Requirements must be checked as stated in isTriviallyReMaterializable().
	virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
	AliasAnalysis *AA) const {
	// Base implementation never claims rematerializability; MI and AA are not
	// inspected here.
	return false;
	}

	/// This method commutes the operands of the given machine instruction MI.
	/// The operands to be commuted are specified by their indices OpIdx1 and
	/// OpIdx2.
	///
	/// If a target has any instructions that are commutable but require
	/// converting to different instructions or making non-trivial changes
	/// to commute them, this method can be overloaded to do that.
	/// The default implementation simply swaps the commutable operands.
	///
	/// If NewMI is false, MI is modified in place and returned; otherwise, a
	/// new machine instruction is created and returned.
	///
	/// Do not call this method for a non-commutable instruction.
	/// Even though the instruction is commutable, the method may still
	/// fail to commute the operands, null pointer is returned in such cases.
	virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
	unsigned OpIdx1,
	unsigned OpIdx2) const;

	/// Assigns the (CommutableOpIdx1, CommutableOpIdx2) pair of commutable
	/// operand indices to (ResultIdx1, ResultIdx2).
	/// One or both input values of the pair: (ResultIdx1, ResultIdx2) may be
	/// predefined to some indices or be undefined (designated by the special
	/// value 'CommuteAnyOperandIndex').
	/// The predefined result indices cannot be re-defined.
	/// The function returns true iff after the result pair redefinition
	/// the fixed result pair is equal to or equivalent to the source pair of
	/// indices: (CommutableOpIdx1, CommutableOpIdx2). It is assumed here that
	/// the pairs (x,y) and (y,x) are equivalent.
	static bool fixCommutedOpIndices(unsigned &ResultIdx1, unsigned &ResultIdx2,
	unsigned CommutableOpIdx1,
	unsigned CommutableOpIdx2);

	private:
	/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
	/// set and the target hook isReallyTriviallyReMaterializable returns false,
	/// this function does target-independent tests to determine if the
	/// instruction is really trivially rematerializable.
	bool isReallyTriviallyReMaterializableGeneric(const MachineInstr &MI,
	AliasAnalysis *AA) const;

	public:
	/// These methods return the opcode of the frame setup/destroy instructions
	/// if they exist (-1 otherwise). Some targets use pseudo instructions in
	/// order to abstract away the difference between operating with a frame
	/// pointer and operating without, through the use of these two instructions.
	///
	unsigned getCallFrameSetupOpcode() const {
	  // Plain accessor for the opcode recorded at construction time.
	  return CallFrameSetupOpcode;
	}
	unsigned getCallFrameDestroyOpcode() const {
	  // Plain accessor for the opcode recorded at construction time.
	  return CallFrameDestroyOpcode;
	}

	/// Returns true if the argument is a frame pseudo instruction.
	bool isFrameInstr(const MachineInstr &I) const {
	  // A frame pseudo is either the call-frame setup or the call-frame destroy
	  // opcode; the setup comparison is made first.
	  return I.getOpcode() == getCallFrameSetupOpcode() ||
	         I.getOpcode() == getCallFrameDestroyOpcode();
	}

	/// Returns true if the argument is a frame setup pseudo instruction.
	bool isFrameSetup(const MachineInstr &I) const {
	  // Compare the instruction's opcode with the recorded frame-setup opcode.
	  const auto SetupOpcode = getCallFrameSetupOpcode();
	  return I.getOpcode() == SetupOpcode;
	}

	/// Returns size of the frame associated with the given frame instruction.
	/// For a frame setup instruction this is the frame space that is set up
	/// after the instruction. For a frame destroy instruction this is the frame
	/// freed by the caller.
	/// Note, in some cases a call frame (or a part of it) may be prepared prior
	/// to the frame setup instruction. It occurs in the calls that involve
	/// inalloca arguments. This function reports only the size of the frame part
	/// that is set up between the frame setup and destroy pseudo instructions.
	int64_t getFrameSize(const MachineInstr &I) const {
	  assert(isFrameInstr(I) && "Not a frame instruction");
	  // Operand 0 of a frame pseudo carries the size as an immediate.
	  const auto &SizeOperand = I.getOperand(0);
	  assert(SizeOperand.getImm() >= 0);
	  return SizeOperand.getImm();
	}

	/// Returns the total frame size, which is made up of the space set up inside
	/// the pair of frame start-stop instructions and the space that is set up
	/// prior to the pair.
	int64_t getFrameTotalSize(const MachineInstr &I) const {
	  // Only the setup pseudo contributes a second immediate (operand 1); for
	  // anything else the total is just the plain frame size.
	  if (!isFrameSetup(I))
	    return getFrameSize(I);
	  assert(I.getOperand(1).getImm() >= 0 &&
	         "Frame size must not be negative");
	  return getFrameSize(I) + I.getOperand(1).getImm();
	}

	/// Accessor for the catch-return opcode recorded at construction time.
	unsigned getCatchReturnOpcode() const {
	  return CatchRetOpcode;
	}
	/// Accessor for the return opcode recorded at construction time.
	unsigned getReturnOpcode() const {
	  return ReturnOpcode;
	}

	/// Returns the actual stack pointer adjustment made by an instruction
	/// as part of a call sequence. By default, only call frame setup/destroy
	/// instructions adjust the stack, but targets may want to override this
	/// to enable more fine-grained adjustment, or adjust by a different value.
	virtual int getSPAdjust(const MachineInstr &MI) const;

	/// Return true if the instruction is a "coalescable" extension instruction.
	/// That is, it's like a copy where it's legal for the source to overlap the
	/// destination. e.g. X86::MOVSX64rr32. If this returns true, then it's
	/// expected the pre-extension value is available as a subreg of the result
	/// register. This also returns the sub-register index in SubIdx.
	virtual bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
	unsigned &DstReg, unsigned &SubIdx) const {
	// Base implementation recognizes nothing and leaves SrcReg, DstReg and
	// SubIdx untouched.
	return false;
	}

	/// If the specified machine instruction is a direct
	/// load from a stack slot, return the virtual or physical register number of
	/// the destination along with the FrameIndex of the loaded stack slot. If
	/// not, return 0. This predicate must return 0 if the instruction has
	/// any side effects other than loading from the stack slot.
	virtual unsigned isLoadFromStackSlot(const MachineInstr &MI,
	int &FrameIndex) const {
	// Base implementation matches no instruction: returns 0 and does not write
	// FrameIndex.
	return 0;
	}

	/// Check for post-frame ptr elimination stack locations as well.
	/// This uses a heuristic so it isn't reliable for correctness.
	virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
	int &FrameIndex) const {
	// Base implementation matches no instruction: returns 0 and does not write
	// FrameIndex.
	return 0;
	}

	/// If the specified machine instruction has a load from a stack slot,
	/// return true along with the FrameIndex of the loaded stack slot and the
	/// machine mem operand containing the reference.
	/// If not, return false. Unlike isLoadFromStackSlot, this returns true for
	/// any instruction that loads from the stack. This is just a hint, as some
	/// cases may be missed.
	virtual bool hasLoadFromStackSlot(const MachineInstr &MI,
	const MachineMemOperand *&MMO,
	int &FrameIndex) const;

	/// If the specified machine instruction is a direct
	/// store to a stack slot, return the virtual or physical register number of
	/// the source reg along with the FrameIndex of the loaded stack slot. If
	/// not, return 0. This predicate must return 0 if the instruction has
	/// any side effects other than storing to the stack slot.
	virtual unsigned isStoreToStackSlot(const MachineInstr &MI,
	int &FrameIndex) const {
	// Base implementation matches no instruction: returns 0 and does not write
	// FrameIndex.
	return 0;
	}

	/// Check for post-frame ptr elimination stack locations as well.
	/// This uses a heuristic, so it isn't reliable for correctness.
	virtual unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
	int &FrameIndex) const {
	// Base implementation matches no instruction: returns 0 and does not write
	// FrameIndex.
	return 0;
	}

	/// If the specified machine instruction has a store to a stack slot,
	/// return true along with the FrameIndex of the loaded stack slot and the
	/// machine mem operand containing the reference.
	/// If not, return false. Unlike isStoreToStackSlot,
	/// this returns true for any instruction that stores to the
	/// stack. This is just a hint, as some cases may be missed.
	virtual bool hasStoreToStackSlot(const MachineInstr &MI,
	const MachineMemOperand *&MMO,
	int &FrameIndex) const;

	/// Return true if the specified machine instruction
	/// is a copy of one stack slot to another and has no other effect.
	/// Provide the identity of the two frame indices.
	virtual bool isStackSlotCopy(const MachineInstr &MI, int &DestFrameIndex,
	int &SrcFrameIndex) const {
	// Base implementation recognizes no copies and leaves both frame indices
	// untouched.
	return false;
	}

	/// Compute the size in bytes and offset within a stack slot of a spilled
	/// register or subregister.
	///
	/// \param [out] Size in bytes of the spilled value.
	/// \param [out] Offset in bytes within the stack slot.
	/// \returns true if both Size and Offset are successfully computed.
	///
	/// Not all subregisters have computable spill slots. For example,
	/// subregisters may not be byte-sized, and a pair of discontiguous
	/// subregisters has no single offset.
	///
	/// Targets with nontrivial bigendian implementations may need to override
	/// this, particularly to support spilled vector registers.
	virtual bool getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx,
	unsigned &Size, unsigned &Offset,
	const MachineFunction &MF) const;

	/// Returns the size in bytes of the specified MachineInstr, or ~0U
	/// when this function is not implemented by a target.
	virtual unsigned getInstSizeInBytes(const MachineInstr &MI) const {
	// ~0U is the "not implemented" answer of the base class; MI is not
	// inspected.
	return ~0U;
	}

	/// Return true if the instruction is as cheap as a move instruction.
	///
	/// Targets for different archs need to override this, and different
	/// micro-architectures can also be finely tuned inside.
	virtual bool isAsCheapAsAMove(const MachineInstr &MI) const {
	  // The base answer is simply whatever the instruction itself reports.
	  const bool CheapPerInstr = MI.isAsCheapAsAMove();
	  return CheapPerInstr;
	}

	/// Return true if the instruction should be sunk by MachineSink.
	///
	/// MachineSink determines on its own whether the instruction is safe to sink;
	/// this gives the target a hook to override the default behavior with regards
	/// to which instructions should be sunk.
	virtual bool shouldSink(const MachineInstr &MI) const {
	  // Base implementation approves every instruction; MI is not inspected.
	  return true;
	}

	/// Re-issue the specified 'original' instruction at the
	/// specific location targeting a new destination register.
	/// The register in Orig->getOperand(0).getReg() will be substituted by
	/// DestReg:SubIdx. Any existing subreg index is preserved or composed with
	/// SubIdx.
	virtual void reMaterialize(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI, unsigned DestReg,
	unsigned SubIdx, const MachineInstr &Orig,
	const TargetRegisterInfo &TRI) const;

	/// \brief Clones instruction or the whole instruction bundle \p Orig and
	/// insert into \p MBB before \p InsertBefore. The target may update operands
	/// that are required to be unique.
	///
	/// \p Orig must not return true for MachineInstr::isNotDuplicable().
	virtual MachineInstr &duplicate(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator InsertBefore,
	const MachineInstr &Orig) const;

	/// This method must be implemented by targets that
	/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
	/// may be able to convert a two-address instruction into one or more true
	/// three-address instructions on demand. This allows the X86 target (for
	/// example) to convert ADD and SHL instructions into LEA instructions if they
	/// would require register copies due to two-addressness.
	///
	/// This method returns a null pointer if the transformation cannot be
	/// performed, otherwise it returns the last new instruction.
	///
	/// The default implementation never converts anything and returns nullptr.
	virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
	MachineInstr &MI,
	LiveVariables *LV) const {
	return nullptr;
	}

	// Sentinel operand index (~0U). It can be passed as an input operand index
	// to findCommutedOpIndices() (and, via the default arguments below, to
	// commuteInstruction()) to tell the method that the corresponding operand
	// index is not pre-defined and that the method can pick any commutable
	// operand.
	static const unsigned CommuteAnyOperandIndex = ~0U;

	/// This method commutes the operands of the given machine instruction MI.
	///
	/// The operands to be commuted are specified by their indices OpIdx1 and
	/// OpIdx2. OpIdx1 and OpIdx2 arguments may be set to a special value
	/// 'CommuteAnyOperandIndex', which means that the method is free to choose
	/// any arbitrarily chosen commutable operand. If both arguments are set to
	/// 'CommuteAnyOperandIndex' then the method looks for 2 different commutable
	/// operands; then commutes them if such operands could be found.
	///
	/// If NewMI is false, MI is modified in place and returned; otherwise, a
	/// new machine instruction is created and returned.
	///
	/// Do not call this method for a non-commutable instruction or
	/// for non-commutable operands.
	/// Even though the instruction is commutable, the method may still
	/// fail to commute the operands; a null pointer is returned in such cases.
	MachineInstr *
	commuteInstruction(MachineInstr &MI, bool NewMI = false,
	unsigned OpIdx1 = CommuteAnyOperandIndex,
	unsigned OpIdx2 = CommuteAnyOperandIndex) const;

	/// Returns true iff the routine could find two commutable operands in the
	/// given machine instruction.
	/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments.
	/// If any of the INPUT values is set to the special value
	/// 'CommuteAnyOperandIndex' then the method arbitrarily picks a commutable
	/// operand, then returns its index in the corresponding argument.
	/// If both of the INPUT values are set to 'CommuteAnyOperandIndex' then the
	/// method looks for 2 commutable operands.
	/// If the INPUT values refer to some operands of MI, then the method simply
	/// returns true if the corresponding operands are commutable and returns
	/// false otherwise.
	///
	/// For example, calling this method this way:
	/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
	/// findCommutedOpIndices(MI, Op1, Op2);
	/// can be interpreted as a query asking to find an operand that would be
	/// commutable with the operand#1.
	virtual bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
	unsigned &SrcOpIdx2) const;

	/// Couples a register with a sub-register index so that Reg:SubReg can be
	/// passed around as one typed value rather than two loose unsigneds.
	/// Both fields default to 0 when omitted.
	struct RegSubRegPair {
	unsigned Reg;
	unsigned SubReg;

	RegSubRegPair(unsigned Reg = 0, unsigned SubReg = 0)
	: Reg{Reg}, SubReg{SubReg} {}
	};

	/// Extends RegSubRegPair with one more sub-register index, giving a typed
	/// representation of "Reg:SubReg, SubIdx".
	/// All three fields default to 0 when omitted.
	struct RegSubRegPairAndIdx : RegSubRegPair {
	unsigned SubIdx;

	RegSubRegPairAndIdx(unsigned Reg = 0, unsigned SubReg = 0,
	unsigned SubIdx = 0)
	: RegSubRegPair{Reg, SubReg}, SubIdx{SubIdx} {}
	};

	/// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
	/// and \p DefIdx.
	/// \param[out] InputRegs inputs of the equivalent REG_SEQUENCE. Each element
	/// of the list is modeled as <Reg:SubReg, SubIdx>.
	/// E.g., REG_SEQUENCE %1:sub1, sub0, %2, sub1 would produce
	/// two elements:
	/// - %1:sub1, sub0
	/// - %2<:0>, sub1
	///
	/// \returns true if it is possible to build such an input sequence
	/// with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isRegSequence() or MI.isRegSequenceLike().
	///
	/// \note The generic implementation does not provide any support for
	/// MI.isRegSequenceLike(). In other words, one has to override
	/// getRegSequenceLikeInputs for target specific instructions.
	bool
	getRegSequenceInputs(const MachineInstr &MI, unsigned DefIdx,
	SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const;

	/// Build the equivalent inputs of an EXTRACT_SUBREG for the given \p MI
	/// and \p DefIdx.
	/// \param[out] InputReg input of the equivalent EXTRACT_SUBREG.
	/// E.g., EXTRACT_SUBREG %1:sub1, sub0, sub1 would produce:
	/// - %1:sub1, sub0
	///
	/// \returns true if it is possible to build such an input sequence
	/// with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isExtractSubreg() or MI.isExtractSubregLike().
	///
	/// \note The generic implementation does not provide any support for
	/// MI.isExtractSubregLike(). In other words, one has to override
	/// getExtractSubregLikeInputs for target specific instructions.
	bool getExtractSubregInputs(const MachineInstr &MI, unsigned DefIdx,
	RegSubRegPairAndIdx &InputReg) const;

	/// Build the equivalent inputs of an INSERT_SUBREG for the given \p MI
	/// and \p DefIdx.
	/// \param[out] BaseReg base input of the equivalent INSERT_SUBREG.
	/// \param[out] InsertedReg inserted input of the equivalent INSERT_SUBREG.
	/// E.g., INSERT_SUBREG %0:sub0, %1:sub1, sub3 would produce:
	/// - BaseReg: %0:sub0
	/// - InsertedReg: %1:sub1, sub3
	///
	/// \returns true if it is possible to build such an input sequence
	/// with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isInsertSubreg() or MI.isInsertSubregLike().
	///
	/// \note The generic implementation does not provide any support for
	/// MI.isInsertSubregLike(). In other words, one has to override
	/// getInsertSubregLikeInputs for target specific instructions.
	bool getInsertSubregInputs(const MachineInstr &MI, unsigned DefIdx,
	RegSubRegPair &BaseReg,
	RegSubRegPairAndIdx &InsertedReg) const;

	/// Return true if two machine instructions would produce identical values.
	/// By default, this is only true when the two instructions
	/// are deemed identical except for defs. If this function is called when the
	/// IR is still in SSA form, the caller can pass the MachineRegisterInfo for
	/// aggressive checks.
	///
	/// \param MI0 first instruction to compare.
	/// \param MI1 second instruction to compare.
	/// \param MRI optional register info enabling the aggressive checks; it
	/// defaults to nullptr.
	virtual bool produceSameValue(const MachineInstr &MI0,
	const MachineInstr &MI1,
	const MachineRegisterInfo *MRI = nullptr) const;

	/// \returns true if a branch from an instruction with opcode \p BranchOpc
	/// is capable of jumping to a position \p BrOffset bytes away.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual bool isBranchOffsetInRange(unsigned BranchOpc,
	int64_t BrOffset) const {
	llvm_unreachable("target did not implement");
	}

	/// \returns The block that branch instruction \p MI jumps to.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const {
	llvm_unreachable("target did not implement");
	}

	/// Insert an unconditional indirect branch at the end of \p MBB to \p
	/// NewDestBB. \p BrOffset indicates the offset of \p NewDestBB relative to
	/// the offset of the position to insert the new branch.
	///
	/// \returns The number of bytes added to the block.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual unsigned insertIndirectBranch(MachineBasicBlock &MBB,
	MachineBasicBlock &NewDestBB,
	const DebugLoc &DL,
	int64_t BrOffset = 0,
	RegScavenger *RS = nullptr) const {
	llvm_unreachable("target did not implement");
	}

	/// Analyze the branching code at the end of MBB, returning
	/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
	/// implemented for a target). Upon success, this returns false and returns
	/// with the following information in various cases:
	///
	/// 1. If this block ends with no branches (it just falls through to its succ)
	/// just return false, leaving TBB/FBB null.
	/// 2. If this block ends with only an unconditional branch, it sets TBB to be
	/// the destination block.
	/// 3. If this block ends with a conditional branch and it falls through to a
	/// successor block, it sets TBB to be the branch destination block and a
	/// list of operands that evaluate the condition. These operands can be
	/// passed to other TargetInstrInfo methods to create new branches.
	/// 4. If this block ends with a conditional branch followed by an
	/// unconditional branch, it returns the 'true' destination in TBB, the
	/// 'false' destination in FBB, and a list of operands that evaluate the
	/// condition. These operands can be passed to other TargetInstrInfo
	/// methods to create new branches.
	///
	/// Note that removeBranch and insertBranch must be implemented to support
	/// cases where this method returns success.
	///
	/// If AllowModify is true, then this routine is allowed to modify the basic
	/// block (e.g. delete instructions after the unconditional branch).
	///
	/// The CFG information in MBB.Predecessors and MBB.Successors must be valid
	/// before calling this function.
	///
	/// The default implementation returns true, i.e. it reports every block's
	/// branching code as not understood.
	virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
	MachineBasicBlock *&FBB,
	SmallVectorImpl<MachineOperand> &Cond,
	bool AllowModify = false) const {
	return true;
	}

	/// Represents a predicate at the MachineFunction level. The control flow a
	/// MachineBranchPredicate represents is:
	///
	/// Reg = LHS `Predicate` RHS == ConditionDef
	/// if Reg then goto TrueDest else goto FalseDest
	///
	/// A default-constructed object has Predicate == PRED_INVALID, both operands
	/// set to the immediate 0, null TrueDest/FalseDest/ConditionDef pointers and
	/// SingleUseCondition == false.
	struct MachineBranchPredicate {
	enum ComparePredicate {
	PRED_EQ, // True if two values are equal
	PRED_NE, // True if two values are not equal
	PRED_INVALID // Sentinel value
	};

	ComparePredicate Predicate = PRED_INVALID;
	MachineOperand LHS = MachineOperand::CreateImm(0);
	MachineOperand RHS = MachineOperand::CreateImm(0);
	MachineBasicBlock *TrueDest = nullptr;
	MachineBasicBlock *FalseDest = nullptr;
	MachineInstr *ConditionDef = nullptr;

	/// SingleUseCondition is true if ConditionDef is dead except for the
	/// branch(es) at the end of the basic block.
	///
	bool SingleUseCondition = false;

	explicit MachineBranchPredicate() = default;
	};

	/// Analyze the branching code at the end of MBB and parse it into the
	/// MachineBranchPredicate structure if possible. Returns false on success
	/// and true on failure.
	///
	/// If AllowModify is true, then this routine is allowed to modify the basic
	/// block (e.g. delete instructions after the unconditional branch).
	///
	/// The default implementation returns true (failure) without touching
	/// \p MBP.
	virtual bool analyzeBranchPredicate(MachineBasicBlock &MBB,
	MachineBranchPredicate &MBP,
	bool AllowModify = false) const {
	return true;
	}

	/// Remove the branching code at the end of the specific MBB.
	/// This is only invoked in cases where analyzeBranch returns success. It
	/// returns the number of instructions that were removed.
	/// If \p BytesRemoved is non-null, report the change in code size from the
	/// removed instructions.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual unsigned removeBranch(MachineBasicBlock &MBB,
	int *BytesRemoved = nullptr) const {
	llvm_unreachable("Target didn't implement TargetInstrInfo::removeBranch!");
	}

	/// Insert branch code into the end of the specified MachineBasicBlock. The
	/// operands to this method are the same as those returned by analyzeBranch.
	/// This is only invoked in cases where analyzeBranch returns success. It
	/// returns the number of instructions inserted. If \p BytesAdded is non-null,
	/// report the change in code size from the added instructions.
	///
	/// It is also invoked by tail merging to add unconditional branches in
	/// cases where analyzeBranch doesn't apply because there was no original
	/// branch to analyze. At least this much must be implemented, else tail
	/// merging needs to be disabled.
	///
	/// The CFG information in MBB.Predecessors and MBB.Successors must be valid
	/// before calling this function.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
	MachineBasicBlock *FBB,
	ArrayRef<MachineOperand> Cond,
	const DebugLoc &DL,
	int *BytesAdded = nullptr) const {
	llvm_unreachable("Target didn't implement TargetInstrInfo::insertBranch!");
	}

	/// Convenience wrapper around insertBranch() for an unconditional branch
	/// from \p MBB to \p DestBB: it forwards with a null false-destination and
	/// an empty condition list, and returns whatever insertBranch() returns.
	unsigned insertUnconditionalBranch(MachineBasicBlock &MBB,
	MachineBasicBlock *DestBB,
	const DebugLoc &DL,
	int *BytesAdded = nullptr) const {
	// No false destination and no condition operands for an unconditional jump.
	MachineBasicBlock *const NoFalseDest = nullptr;
	ArrayRef<MachineOperand> NoCond;
	return insertBranch(MBB, DestBB, NoFalseDest, NoCond, DL, BytesAdded);
	}

	/// Analyze the loop code, return true if it cannot be understood. Upon
	/// success, this function returns false and returns information about the
	/// induction variable and compare instruction used at the end.
	///
	/// The default implementation returns true (loop not understood) and leaves
	/// \p IndVarInst and \p CmpInst untouched.
	virtual bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
	MachineInstr *&CmpInst) const {
	return true;
	}

	/// Generate code to reduce the loop iteration by one and check if the loop is
	/// finished. Return the value/register of the new loop count. We need
	/// this function when peeling off one or more iterations of a loop. This
	/// function assumes the nth iteration is peeled first.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar,
	MachineInstr &Cmp,
	SmallVectorImpl<MachineOperand> &Cond,
	SmallVectorImpl<MachineInstr *> &PrevInsts,
	unsigned Iter, unsigned MaxIter) const {
	llvm_unreachable("Target didn't implement ReduceLoopCount");
	}

	/// Delete the instruction at \p Tail and everything after it, replacing it
	/// with an unconditional branch to \p NewDest. This is used by the tail
	/// merging pass.
	virtual void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
	MachineBasicBlock *NewDest) const;

	/// Return true if it's legal to split the given basic
	/// block at the specified instruction (i.e. instruction would be the start
	/// of a new basic block).
	///
	/// The default implementation returns true for every position.
	virtual bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI) const {
	return true;
	}

	/// Return true if it's profitable to predicate
	/// instructions with accumulated instruction latency of "NumCycles"
	/// of the specified basic block, where the probability of the instructions
	/// being executed is given by Probability, and Confidence is a measure
	/// of our confidence that it will be properly predicted.
	///
	/// NOTE(review): this signature has no "Confidence" parameter; only
	/// \p Probability is passed in.
	///
	/// The default implementation returns false.
	virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
	unsigned ExtraPredCycles,
	BranchProbability Probability) const {
	return false;
	}

	/// Second variant of isProfitableToIfCvt. This one
	/// checks for the case where two basic blocks from true and false path
	/// of a if-then-else (diamond) are predicated on mutually exclusive
	/// predicates, where the probability of the true path being taken is given
	/// by Probability, and Confidence is a measure of our confidence that it
	/// will be properly predicted.
	///
	/// NOTE(review): this signature has no "Confidence" parameter; only
	/// \p Probability is passed in.
	///
	/// The default implementation returns false.
	virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles,
	unsigned ExtraTCycles,
	MachineBasicBlock &FMBB, unsigned NumFCycles,
	unsigned ExtraFCycles,
	BranchProbability Probability) const {
	return false;
	}

	/// Return true if it's profitable for if-converter to duplicate instructions
	/// of specified accumulated instruction latencies in the specified MBB to
	/// enable if-conversion.
	/// The probability of the instructions being executed is given by
	/// Probability, and Confidence is a measure of our confidence that it
	/// will be properly predicted.
	///
	/// NOTE(review): this signature has no "Confidence" parameter; only
	/// \p Probability is passed in.
	///
	/// The default implementation returns false.
	virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
	unsigned NumCycles,
	BranchProbability Probability) const {
	return false;
	}

	/// Return true if it's profitable to unpredicate
	/// one side of a 'diamond', i.e. two sides of if-else predicated on mutually
	/// exclusive predicates.
	/// e.g.
	/// subeq r0, r1, #1
	/// addne r0, r1, #1
	/// =>
	/// sub r0, r1, #1
	/// addne r0, r1, #1
	///
	/// This may be profitable if conditional instructions are always executed.
	///
	/// The default implementation returns false.
	virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
	MachineBasicBlock &FMBB) const {
	return false;
	}

	/// Return true if it is possible to insert a select
	/// instruction that chooses between TrueReg and FalseReg based on the
	/// condition code in Cond.
	///
	/// When successful, also return the latency in cycles from TrueReg,
	/// FalseReg, and Cond to the destination register. In most cases, a select
	/// instruction will be 1 cycle, so CondCycles = TrueCycles = FalseCycles = 1
	///
	/// Some x86 implementations have 2-cycle cmov instructions.
	///
	/// The default implementation returns false and does not write the cycle
	/// outputs.
	///
	/// @param MBB Block where select instruction would be inserted.
	/// @param Cond Condition returned by analyzeBranch.
	/// @param TrueReg Virtual register to select when Cond is true.
	/// @param FalseReg Virtual register to select when Cond is false.
	/// @param CondCycles Latency from Cond+Branch to select output.
	/// @param TrueCycles Latency from TrueReg to select output.
	/// @param FalseCycles Latency from FalseReg to select output.
	virtual bool canInsertSelect(const MachineBasicBlock &MBB,
	ArrayRef<MachineOperand> Cond, unsigned TrueReg,
	unsigned FalseReg, int &CondCycles,
	int &TrueCycles, int &FalseCycles) const {
	return false;
	}

	/// Insert a select instruction into MBB before I that will copy TrueReg to
	/// DstReg when Cond is true, and FalseReg to DstReg when Cond is false.
	///
	/// This function can only be called after canInsertSelect() returned true.
	/// The condition in Cond comes from analyzeBranch, and it can be assumed
	/// that the same flags or registers required by Cond are available at the
	/// insertion point.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	///
	/// @param MBB Block where select instruction should be inserted.
	/// @param I Insertion point.
	/// @param DL Source location for debugging.
	/// @param DstReg Virtual register to be defined by select instruction.
	/// @param Cond Condition as computed by analyzeBranch.
	/// @param TrueReg Virtual register to copy when Cond is true.
	/// @param FalseReg Virtual register to copy when Cond is false.
	virtual void insertSelect(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator I, const DebugLoc &DL,
	unsigned DstReg, ArrayRef<MachineOperand> Cond,
	unsigned TrueReg, unsigned FalseReg) const {
	llvm_unreachable("Target didn't implement TargetInstrInfo::insertSelect!");
	}

	/// Analyze the given select instruction, returning true if
	/// it cannot be understood. It is assumed that MI is a select instruction;
	/// the default implementation asserts MI.getDesc().isSelect().
	///
	/// When successful, return the controlling condition and the operands that
	/// determine the true and false result values.
	///
	/// Result = SELECT Cond, TrueOp, FalseOp
	///
	/// Some targets can optimize select instructions, for example by predicating
	/// the instruction defining one of the operands. Such targets should set
	/// Optimizable.
	///
	/// The default implementation returns true (not understood) and leaves all
	/// output arguments untouched.
	///
	/// @param MI Select instruction to analyze.
	/// @param Cond Condition controlling the select.
	/// @param TrueOp Operand number of the value selected when Cond is true.
	/// @param FalseOp Operand number of the value selected when Cond is false.
	/// @param Optimizable Returned as true if MI is optimizable.
	/// @returns False on success.
	virtual bool analyzeSelect(const MachineInstr &MI,
	SmallVectorImpl<MachineOperand> &Cond,
	unsigned &TrueOp, unsigned &FalseOp,
	bool &Optimizable) const {
	assert(MI.getDesc().isSelect() && "MI must be a select instruction");
	return true;
	}

	/// Given a select instruction that was understood by
	/// analyzeSelect and returned Optimizable = true, attempt to optimize MI by
	/// merging it with one of its operands. Returns NULL on failure.
	///
	/// When successful, returns the new select instruction. The client is
	/// responsible for deleting MI.
	///
	/// If both sides of the select can be optimized, PreferFalse is used to pick
	/// a side.
	///
	/// The default implementation is llvm_unreachable, so any target whose
	/// analyzeSelect sets Optimizable must override this method.
	///
	/// @param MI Optimizable select instruction.
	/// @param NewMIs Set that records all MIs in the basic block up to \p
	/// MI. Has to be updated with any newly created MI or deleted ones.
	/// @param PreferFalse Try to optimize FalseOp instead of TrueOp.
	/// @returns Optimized instruction or NULL.
	virtual MachineInstr *optimizeSelect(MachineInstr &MI,
	SmallPtrSetImpl<MachineInstr *> &NewMIs,
	bool PreferFalse = false) const {
	// This function must be implemented if Optimizable is ever set.
	llvm_unreachable("Target must implement TargetInstrInfo::optimizeSelect!");
	}

	/// Emit instructions to copy a pair of physical registers.
	///
	/// This function should support copies within any legal register class as
	/// well as any cross-class copies created during instruction selection.
	///
	/// The source and destination registers may overlap, which may require a
	/// careful implementation when multiple copy instructions are required for
	/// large registers. See for example the ARM target.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual void copyPhysReg(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI, const DebugLoc &DL,
	unsigned DestReg, unsigned SrcReg,
	bool KillSrc) const {
	llvm_unreachable("Target didn't implement TargetInstrInfo::copyPhysReg!");
	}

	/// Store the specified register of the given register class to the specified
	/// stack frame index. The store instruction is to be added to the given
	/// machine basic block before the specified machine instruction. If isKill
	/// is true, the register operand is the last use and must be marked kill.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	unsigned SrcReg, bool isKill, int FrameIndex,
	const TargetRegisterClass *RC,
	const TargetRegisterInfo *TRI) const {
	llvm_unreachable("Target didn't implement "
	"TargetInstrInfo::storeRegToStackSlot!");
	}

	/// Load the specified register of the given register class from the specified
	/// stack frame index. The load instruction is to be added to the given
	/// machine basic block before the specified machine instruction.
	///
	/// The default implementation is llvm_unreachable: a target has to override
	/// this hook before it may be called.
	virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	unsigned DestReg, int FrameIndex,
	const TargetRegisterClass *RC,
	const TargetRegisterInfo *TRI) const {
	llvm_unreachable("Target didn't implement "
	"TargetInstrInfo::loadRegFromStackSlot!");
	}

	/// This function is called for all pseudo instructions
	/// that remain after register allocation. Many pseudo instructions are
	/// created to help register allocation. This is the place to convert them
	/// into real instructions. The target can edit MI in place, or it can insert
	/// new instructions and erase MI. The function should return true if
	/// anything was changed.
	///
	/// The default implementation changes nothing and returns false.
	virtual bool expandPostRAPseudo(MachineInstr &MI) const { return false; }

	/// Check whether the target can fold a load that feeds a subreg operand
	/// (or a subreg operand that feeds a store).
	/// For example, X86 may want to return true if it can fold
	/// movl (%esp), %eax
	/// subb, %al, ...
	/// Into:
	/// subb (%esp), ...
	///
	/// Ideally, we'd like the target implementation of foldMemoryOperand() to
	/// reject subregs - but since this behavior used to be enforced in the
	/// target-independent code, moving this responsibility to the targets
	/// has the potential of causing nasty silent breakage in out-of-tree targets.
	///
	/// The default implementation returns false.
	virtual bool isSubregFoldable() const { return false; }

	/// Attempt to fold a load or store of the specified stack
	/// slot into the specified machine instruction for the specified operand(s).
	/// If this is possible, a new instruction is returned with the specified
	/// operand folded, otherwise NULL is returned.
	/// The new instruction is inserted before MI, and the client is responsible
	/// for removing the old instruction.
	///
	/// \param MI instruction to fold the stack slot access into.
	/// \param Ops indices of the operand(s) of \p MI to fold.
	/// \param FrameIndex the stack slot being loaded or stored.
	/// \param LIS optional live-interval information; defaults to nullptr.
	MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
	int FrameIndex,
	LiveIntervals *LIS = nullptr) const;

	/// Same as the previous version except it allows folding of any load and
	/// store from / to any address, not just from a specific stack slot.
	/// The memory access to fold is given by \p LoadMI instead of a frame index.
	MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
	MachineInstr &LoadMI,
	LiveIntervals *LIS = nullptr) const;

	/// Return true when there is potentially a faster code sequence
	/// for an instruction chain ending in \p Root. All potential patterns are
	/// returned in the \p Patterns vector. Patterns should be sorted in priority
	/// order since the pattern evaluator stops checking as soon as it finds a
	/// faster sequence.
	/// \param Root - Instruction that could be combined with one of its operands
	/// \param Patterns - Vector of possible combination patterns
	virtual bool getMachineCombinerPatterns(
	MachineInstr &Root,
	SmallVectorImpl<MachineCombinerPattern> &Patterns) const;

	/// Return true when the code sequence generated for \p Pattern can improve
	/// throughput. It should be called only for instructions in loops.
	/// \param Pattern - combiner pattern
	virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const;

	/// Return true if the input \p Inst is part of a chain of dependent ops
	/// that are suitable for reassociation, otherwise return false.
	/// If the instruction's operands must be commuted to have a previous
	/// instruction of the same type define the first source operand, \p Commuted
	/// will be set to true.
	bool isReassociationCandidate(const MachineInstr &Inst, bool &Commuted) const;

	/// Return true when \p Inst is both associative and commutative.
	/// The default implementation returns false for every instruction.
	virtual bool isAssociativeAndCommutative(const MachineInstr &Inst) const {
	return false;
	}

	/// Return true when \p Inst has reassociable operands in the same \p MBB.
	virtual bool hasReassociableOperands(const MachineInstr &Inst,
	const MachineBasicBlock *MBB) const;

	/// Return true when \p Inst has a reassociable sibling.
	/// \p Commuted is an output flag (presumably with the same meaning as in
	/// isReassociationCandidate -- confirm against the implementation).
	bool hasReassociableSibling(const MachineInstr &Inst, bool &Commuted) const;

	/// When getMachineCombinerPatterns() finds patterns, this function generates
	/// the instructions that could replace the original code sequence. The client
	/// has to decide whether the actual replacement is beneficial or not.
	/// \param Root - Instruction that could be combined with one of its operands
	/// \param Pattern - Combination pattern for Root
	/// \param InsInstrs - Vector of new instructions that implement \p Pattern
	/// \param DelInstrs - Old instructions, including Root, that could be
	/// replaced by \p InsInstrs
	/// \param InstrIdxForVirtReg - map of virtual register to instruction in
	/// \p InsInstrs that defines it
	virtual void genAlternativeCodeSequence(
	MachineInstr &Root, MachineCombinerPattern Pattern,
	SmallVectorImpl<MachineInstr *> &InsInstrs,
	SmallVectorImpl<MachineInstr *> &DelInstrs,
	DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;

	/// Attempt to reassociate \p Root and \p Prev according to \p Pattern to
	/// reduce critical path length.
	/// \p InsInstrs, \p DelInstrs and \p InstrIdxForVirtReg have the same types
	/// as the corresponding genAlternativeCodeSequence parameters.
	void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
	MachineCombinerPattern Pattern,
	SmallVectorImpl<MachineInstr *> &InsInstrs,
	SmallVectorImpl<MachineInstr *> &DelInstrs,
	DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;

	/// This is an architecture-specific helper function of reassociateOps.
	/// Set special operand attributes for new instructions after reassociation.
	/// The default implementation is empty (a no-op).
	virtual void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
	MachineInstr &NewMI1,
	MachineInstr &NewMI2) const {}

	/// Return true when a target supports MachineCombiner.
	/// The default implementation returns false.
	virtual bool useMachineCombiner() const { return false; }

	+ /// Return true if the given SDNode can be copied during scheduling even if
	+ /// it has glue. The default implementation returns false for every node.
	+ virtual bool canCopyGluedNodeDuringSchedule(SDNode *N) const { return false; }
	+
	protected:
	/// Target-dependent implementation for foldMemoryOperand (stack-slot
	/// variant, identified by \p FrameIndex).
	/// Target-independent code in foldMemoryOperand will
	/// take care of adding a MachineMemOperand to the newly created instruction.
	/// The instruction and any auxiliary instructions necessary will be inserted
	/// at InsertPt.
	///
	/// The default implementation folds nothing and returns nullptr.
	virtual MachineInstr *
	foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
	ArrayRef<unsigned> Ops,
	MachineBasicBlock::iterator InsertPt, int FrameIndex,
	LiveIntervals *LIS = nullptr) const {
	return nullptr;
	}

	/// Target-dependent implementation for foldMemoryOperand (variant that folds
	/// the memory access performed by \p LoadMI).
	/// Target-independent code in foldMemoryOperand will
	/// take care of adding a MachineMemOperand to the newly created instruction.
	/// The instruction and any auxiliary instructions necessary will be inserted
	/// at InsertPt.
	///
	/// The default implementation folds nothing and returns nullptr.
	virtual MachineInstr *foldMemoryOperandImpl(
	MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
	MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
	LiveIntervals *LIS = nullptr) const {
	return nullptr;
	}

	/// \brief Target-dependent implementation of getRegSequenceInputs.
	///
	/// \returns true if it is possible to build the equivalent
	/// REG_SEQUENCE inputs with the pair \p MI, \p DefIdx. False otherwise.
	/// The default implementation returns false.
	///
	/// \pre MI.isRegSequenceLike().
	///
	/// \see TargetInstrInfo::getRegSequenceInputs.
	virtual bool getRegSequenceLikeInputs(
	const MachineInstr &MI, unsigned DefIdx,
	SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
	return false;
	}

	/// \brief Target-dependent implementation of getExtractSubregInputs.
	///
	/// \returns true if it is possible to build the equivalent
	/// EXTRACT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
	/// The default implementation returns false.
	///
	/// \pre MI.isExtractSubregLike().
	///
	/// \see TargetInstrInfo::getExtractSubregInputs.
	virtual bool getExtractSubregLikeInputs(const MachineInstr &MI,
	unsigned DefIdx,
	RegSubRegPairAndIdx &InputReg) const {
	return false;
	}

	/// \brief Target-dependent implementation of getInsertSubregInputs.
	///
	/// \returns true if it is possible to build the equivalent
	/// INSERT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
	/// The default implementation returns false.
	///
	/// \pre MI.isInsertSubregLike().
	///
	/// \see TargetInstrInfo::getInsertSubregInputs.
	virtual bool
	getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
	RegSubRegPair &BaseReg,
	RegSubRegPairAndIdx &InsertedReg) const {
	return false;
	}

	public:
	/// getAddressSpaceForPseudoSourceKind - Given the kind of memory
	/// (e.g. stack) the target returns the corresponding address space.
	/// The default implementation returns address space 0 for every kind.
	virtual unsigned
	getAddressSpaceForPseudoSourceKind(PseudoSourceValue::PSVKind Kind) const {
	return 0;
	}

	/// unfoldMemoryOperand - Separate a single instruction which folded a load or
	/// a store or a load and a store into two or more instructions. If this is
	/// possible, returns true as well as the new instructions by reference
	/// (in \p NewMIs).
	///
	/// The default implementation unfolds nothing and returns false.
	virtual bool
	unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
	bool UnfoldLoad, bool UnfoldStore,
	SmallVectorImpl<MachineInstr *> &NewMIs) const {
	return false;
	}

	/// Overload of unfoldMemoryOperand that takes a SelectionDAG node \p N
	/// instead of a MachineInstr and has a \p NewNodes output list.
	/// The default implementation unfolds nothing and returns false.
	virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
	SmallVectorImpl<SDNode *> &NewNodes) const {
	return false;
	}

	/// Returns the opcode of the would be new
	/// instruction after load / store are unfolded from an instruction of the
	/// specified opcode. It returns zero if the specified unfolding is not
	/// possible. If LoadRegIndex is non-null, it is filled in with the operand
	/// index of the operand which will hold the register holding the loaded
	/// value.
	///
	/// The default implementation returns 0 (unfolding not possible) and does
	/// not write through \p LoadRegIndex.
	virtual unsigned
	getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore,
	unsigned *LoadRegIndex = nullptr) const {
	return 0;
	}

	/// This is used by the pre-regalloc scheduler to determine if two loads are
	/// loading from the same base address. It should only return true if the base
	/// pointers are the same and the only differences between the two addresses
	/// are the offset. It also returns the offsets by reference.
	virtual bool areLoadsFromSameBasePtr(SDNode Load1, SDNode Load2,
	int64_t &Offset1,
	int64_t &Offset2) const {
	return false;
	}

	/// This is a used by the pre-regalloc scheduler to determine (in conjunction
	/// with areLoadsFromSameBasePtr) if two loads should be scheduled together.
	/// On some targets if two loads are loading from
	/// addresses in the same cache line, it's better if they are scheduled
	/// together. This function takes two integers that represent the load offsets
	/// from the common base address. It returns true if it decides it's desirable
	/// to schedule the two loads together. "NumLoads" is the number of loads that
	/// have already been scheduled after Load1.
	virtual bool shouldScheduleLoadsNear(SDNode Load1, SDNode Load2,
	int64_t Offset1, int64_t Offset2,
	unsigned NumLoads) const {
	return false;
	}

	/// Query the base register and byte offset used by a memory-accessing
	/// instruction \p MemOp; the answers are meant to be delivered through
	/// \p BaseReg and \p Offset. The default implementation reports failure
	/// (false) and writes nothing.
	virtual bool getMemOpBaseRegImmOfs(
	    MachineInstr &MemOp, unsigned &BaseReg, int64_t &Offset,
	    const TargetRegisterInfo *TRI) const {
	  return false;
	}

	/// Tell whether \p MI carries a base register and an offset. On a true
	/// result, \p BasePos and \p OffsetPos hold the operand positions of the
	/// base register and of the offset. The default implementation returns
	/// false and leaves both positions unmodified.
	virtual bool
	getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos,
	    unsigned &OffsetPos) const {
	  return false;
	}

	/// When \p MI increments by a constant, the amount is reported (via
	/// \p Value, the only output this signature offers). The default
	/// implementation returns false and does not write \p Value.
	virtual bool getIncrementValue(const MachineInstr &MI,
	    int &Value) const {
	  return false;
	}

	/// Decide whether two memory operations should be scheduled next to each
	/// other. For this hook to have any effect, one of
	///   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
	///   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
	/// must be added in TargetPassConfig::createMachineScheduler().
	/// There is no usable default: reaching the base version is a fatal
	/// "unreachable" error.
	virtual bool shouldClusterMemOps(
	    MachineInstr &FirstLdSt, unsigned BaseReg1,
	    MachineInstr &SecondLdSt, unsigned BaseReg2,
	    unsigned NumLoads) const {
	  llvm_unreachable("target did not implement shouldClusterMemOps()");
	}

	/// Invert the branch condition described by \p Cond.
	/// Note the inverted convention: false signals success, true signals that
	/// the condition cannot be reversed. The default implementation reports
	/// "cannot reverse" (true).
	virtual bool reverseBranchCondition(
	    SmallVectorImpl<MachineOperand> &Cond) const {
	  return true;
	}

	/// Insert a noop into the instruction stream at the specified point.
	/// \param MBB the basic block that receives the noop.
	/// \param MI  iterator identifying the insertion point.
	/// Only declared here; no inline default body is provided.
	virtual void insertNoop(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI) const;

	/// Return the noop instruction to use for a noop.
	/// The function itself returns void, so the instruction is delivered
	/// through the \p NopInst reference.
	virtual void getNoop(MCInst &NopInst) const;

	/// Identify post-incremented instructions. The default implementation
	/// treats no instruction as post-incremented.
	virtual bool isPostIncrement(const MachineInstr &MI) const {
	  return false;
	}

	/// Tell whether \p MI is already predicated. The default implementation
	/// answers false for every instruction.
	virtual bool isPredicated(const MachineInstr &MI) const {
	  return false;
	}

	/// Returns true if the instruction is a
	/// terminator instruction that has not been predicated.
	/// Only declared here; no inline default body is provided.
	virtual bool isUnpredicatedTerminator(const MachineInstr &MI) const;

	/// Tell whether \p MI is an unconditional tail call. The default
	/// implementation recognizes none.
	virtual bool
	isUnconditionalTailCall(const MachineInstr &MI) const {
	  return false;
	}

	/// Tell whether \p TailCall can be made conditional on the branch
	/// condition \p Cond. The default implementation always declines.
	virtual bool canMakeTailCallConditional(
	    SmallVectorImpl<MachineOperand> &Cond,
	    const MachineInstr &TailCall) const {
	  return false;
	}

	/// Swap the conditional branch in \p MBB for a conditional tail call.
	/// There is no usable default: reaching the base version is a fatal
	/// "unreachable" error.
	virtual void replaceBranchWithTailCall(
	    MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
	    const MachineInstr &TailCall) const {
	  llvm_unreachable("Target didn't implement replaceBranchWithTailCall!");
	}

	/// Convert the instruction into a predicated instruction.
	/// It returns true if the operation was successful.
	/// \param MI   the instruction to convert.
	/// \param Pred the predicate operands to apply.
	/// Only declared here; no inline default body is provided.
	virtual bool PredicateInstruction(MachineInstr &MI,
	ArrayRef<MachineOperand> Pred) const;

	/// Tell whether predicate \p Pred1 subsumes predicate \p Pred2
	/// (for instance, GE subsumes GT). The default implementation never
	/// reports subsumption.
	virtual bool
	SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
	    ArrayRef<MachineOperand> Pred2) const {
	  return false;
	}

	/// Tell whether \p MI defines any predicate or condition-code register
	/// used for predication. On a true result the defined predicate(s) are
	/// returned through \p Pred. The default implementation returns false and
	/// leaves \p Pred alone.
	virtual bool
	DefinesPredicate(MachineInstr &MI,
	    std::vector<MachineOperand> &Pred) const {
	  return false;
	}

	/// Tell whether \p MI can be predicated. The default simply forwards the
	/// isPredicable() answer of the instruction's descriptor, i.e. it is true
	/// for every instruction with a PredicateOperand.
	virtual bool isPredicable(const MachineInstr &MI) const {
	  const auto &Desc = MI.getDesc();
	  return Desc.isPredicable();
	}

	/// Tell whether a machine instruction defining a register of class \p RC
	/// may safely be moved. The default implementation considers every
	/// register class safe.
	virtual bool
	isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
	  return true;
	}

	/// Test if the given instruction should be considered a scheduling boundary.
	/// This primarily includes labels and terminators.
	/// \param MI  the instruction under test.
	/// \param MBB a basic block and \param MF a function supplied as context.
	/// Only declared here; no inline default body is provided.
	virtual bool isSchedulingBoundary(const MachineInstr &MI,
	const MachineBasicBlock *MBB,
	const MachineFunction &MF) const;

	/// Measure the specified inline asm to determine an approximation of its
	/// length.
	/// \param Str the inline asm text.
	/// \param MAI assembler information supplied for the measurement.
	/// NOTE(review): the unit of the returned length is not stated here.
	virtual unsigned getInlineAsmLength(const char *Str,
	const MCAsmInfo &MAI) const;

	/// Allocate and return a hazard recognizer to use for this target when
	/// scheduling the machine instructions before register allocation.
	/// This variant is keyed on the subtarget (\p STI).
	/// NOTE(review): who owns the returned object is not stated here — confirm.
	virtual ScheduleHazardRecognizer *
	CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
	const ScheduleDAG *DAG) const;

	/// Allocate and return a hazard recognizer to use for this target when
	/// scheduling the machine instructions before register allocation.
	/// This variant takes itinerary data instead of a subtarget.
	/// NOTE(review): the description is word-for-word the same as
	/// CreateTargetHazardRecognizer; confirm how the "MI" variant differs.
	virtual ScheduleHazardRecognizer *
	CreateTargetMIHazardRecognizer(const InstrItineraryData *,
	const ScheduleDAG *DAG) const;

	/// Allocate and return a hazard recognizer to use for this target when
	/// scheduling the machine instructions after register allocation.
	/// This overload takes itinerary data and the scheduling DAG; a second
	/// overload taking a MachineFunction serves non-scheduling passes.
	virtual ScheduleHazardRecognizer *
	CreateTargetPostRAHazardRecognizer(const InstrItineraryData *,
	const ScheduleDAG *DAG) const;

	/// Allocate and return a hazard recognizer for use by non-scheduling
	/// passes. The default implementation provides none and returns nullptr.
	virtual ScheduleHazardRecognizer *CreateTargetPostRAHazardRecognizer(
	    const MachineFunction &MF) const {
	  return nullptr;
	}

	/// Provide a global flag for disabling the PreRA hazard recognizer that
	/// targets may choose to honor.
	/// Non-virtual; only declared here.
	bool usePreRAHazardRecognizer() const;

	/// Analyze the comparison instruction \p MI. When the analysis succeeds
	/// (true), the source registers are reported in \p SrcReg and \p SrcReg2
	/// (the latter when there are two register operands) and the value being
	/// compared against in \p Value.
	/// NOTE(review): the meaning of \p Mask is not described here — confirm.
	/// The default implementation cannot analyze anything and returns false.
	virtual bool analyzeCompare(
	    const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2,
	    int &Mask, int &Value) const {
	  return false;
	}

	/// Try to turn the comparison \p CmpInstr into something cheaper. For
	/// example, most ARM instructions can set the flags register themselves,
	/// which makes a separate CMP unnecessary. The default implementation
	/// performs no optimization and returns false.
	virtual bool optimizeCompareInstr(
	    MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2,
	    int Mask, int Value, const MachineRegisterInfo *MRI) const {
	  return false;
	}
	/// Hook for optimizing the conditional branch \p MI.
	/// NOTE(review): the contract is undocumented; presumably true means the
	/// branch was changed — confirm. The default implementation returns false.
	virtual bool optimizeCondBranch(MachineInstr &MI) const {
	  return false;
	}

	/// Attempt to eliminate a load by folding it into a register operand at
	/// its use. Folding happens if and only if the def and the use live in the
	/// same basic block, and only a single load is examined to see whether it
	/// can be folded into \p MI.
	///
	/// \p FoldAsLoadDefReg is the virtual register defined by the load under
	/// consideration; \p DefMI returns the instruction defining that register.
	/// The return value is the instruction produced by the folding. The
	/// default implementation folds nothing and returns nullptr.
	virtual MachineInstr *optimizeLoadInstr(
	    MachineInstr &MI, const MachineRegisterInfo *MRI,
	    unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const {
	  return nullptr;
	}

	/// \p Reg is known to be defined by a move-immediate instruction; try to
	/// fold that immediate into the use \p UseMI.
	///
	/// If MRI->hasOneNonDBGUse(Reg) holds and this function returns true, the
	/// caller may assume \p DefMI has been erased from its parent block. In
	/// every other case the caller may assume this function did not erase it.
	/// The default implementation folds nothing and returns false.
	virtual bool FoldImmediate(
	    MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
	    MachineRegisterInfo *MRI) const {
	  return false;
	}

	/// Return the number of u-operations the given machine
	/// instruction will be decoded to on the target cpu. The itinerary's
	/// IssueWidth is the number of microops that can be dispatched each
	/// cycle. An instruction with zero microops takes no dispatch resources.
	/// \param ItinData itinerary data supplied for the query.
	/// \param MI       the instruction being measured.
	virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
	const MachineInstr &MI) const;

	/// Identify pseudo instructions that consume no machine resources in their
	/// current form. The scheduler should treat these common cases as free
	/// instead of conservatively handling them as instructions without an
	/// itinerary. Every opcode up to and including TargetOpcode::COPY counts.
	bool isZeroCost(unsigned Opcode) const {
	  const bool IsFree = Opcode <= TargetOpcode::COPY;
	  return IsFree;
	}

	/// SelectionDAG-node overload of getOperandLatency: takes a defining node
	/// and operand index (\p DefNode, \p DefIdx) and a using node and operand
	/// index (\p UseNode, \p UseIdx).
	/// NOTE(review): the meaning of the returned int (and of negative values)
	/// is not documented on this overload — confirm.
	virtual int getOperandLatency(const InstrItineraryData *ItinData,
	SDNode *DefNode, unsigned DefIdx,
	SDNode *UseNode, unsigned UseIdx) const;

	/// Compute and return the use operand latency of a given pair of def and use.
	/// In most cases, the static scheduling itinerary was enough to determine the
	/// operand latency. But it may not be possible for instructions with variable
	/// number of defs / uses.
	///
	/// This is a raw interface to the itinerary that may be directly overridden
	/// by a target. Use computeOperandLatency to get the best estimate of
	/// latency.
	///
	/// \param DefMI,DefIdx the defining instruction and its operand index.
	/// \param UseMI,UseIdx the using instruction and its operand index.
	virtual int getOperandLatency(const InstrItineraryData *ItinData,
	const MachineInstr &DefMI, unsigned DefIdx,
	const MachineInstr &UseMI,
	unsigned UseIdx) const;

	/// Compute the instruction latency of a given instruction.
	/// If the instruction has higher cost when predicated, it's returned via
	/// PredCost.
	/// \param PredCost optional output; defaults to nullptr.
	virtual unsigned getInstrLatency(const InstrItineraryData *ItinData,
	const MachineInstr &MI,
	unsigned *PredCost = nullptr) const;

	/// Query the predication cost of \p MI.
	/// NOTE(review): undocumented in this header; presumably the extra cost an
	/// instruction incurs when predicated — confirm against the definition.
	virtual unsigned getPredicationCost(const MachineInstr &MI) const;

	/// SelectionDAG-node overload of getInstrLatency. Note that it returns int,
	/// unlike the MachineInstr overload, which returns unsigned.
	virtual int getInstrLatency(const InstrItineraryData *ItinData,
	SDNode *Node) const;

	/// Return the default expected latency for a def based on its opcode.
	/// \param SchedModel the scheduling model consulted.
	/// \param DefMI      the defining instruction.
	/// Non-virtual; only declared here.
	unsigned defaultDefLatency(const MCSchedModel &SchedModel,
	const MachineInstr &DefMI) const;

	/// Compute a latency for the def operands of \p DefMI using \p ItinData.
	/// NOTE(review): undocumented in this header; the meaning of the returned
	/// int (including any negative value) must be confirmed at the definition.
	int computeDefOperandLatency(const InstrItineraryData *ItinData,
	const MachineInstr &DefMI) const;

	/// Tell whether opcode \p opc has a high latency to its result. The
	/// default implementation regards no opcode as high latency.
	virtual bool isHighLatencyDef(int opc) const {
	  return false;
	}

	/// Evaluate the operand latency between a def of a register and a use in
	/// the current loop, returning true when the target rates it 'high'.
	/// Optimization passes such as machine LICM use this to judge whether
	/// hoisting an instruction pays off even under high register pressure.
	/// The default implementation never reports a high latency.
	virtual bool hasHighOperandLatency(
	    const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI,
	    const MachineInstr &DefMI, unsigned DefIdx,
	    const MachineInstr &UseMI, unsigned UseIdx) const {
	  return false;
	}

	/// Compute operand latency of a def of 'Reg'. Return true
	/// if the target considered it 'low'.
	/// \param DefMI,DefIdx the defining instruction and its operand index.
	/// Only declared here; no inline default body is provided.
	virtual bool hasLowDefLatency(const TargetSchedModel &SchedModel,
	const MachineInstr &DefMI,
	unsigned DefIdx) const;

	/// Run target-specific verification on \p MI; \p ErrInfo is available for
	/// an error description. The default implementation accepts every
	/// instruction (returns true) and leaves \p ErrInfo unchanged.
	virtual bool
	verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const {
	  return true;
	}

	/// Report the execution domain \p MI currently uses together with a bit
	/// mask of the domains it could use.
	///
	/// Some micro-architectures have several execution domains and offer
	/// several opcodes performing the same operation in different domains;
	/// x86's por, orps and orpd, for instance, all do the same thing. Writing a
	/// register in one domain and reading it in another costs latency.
	///
	/// The result is the pair (domain, mask). setExecutionDomain can switch
	/// the opcode to any domain present in the mask. An instruction whose
	/// domain cannot be changed should return a mask of 0.
	///
	/// Domain numbers carry no special meaning, except that domain 0 denotes
	/// instructions not associated with any interesting execution domain.
	///
	/// The default implementation returns (0, 0).
	virtual std::pair<uint16_t, uint16_t>
	getExecutionDomain(const MachineInstr &MI) const {
	  return std::pair<uint16_t, uint16_t>(0, 0);
	}

	/// Rewrite the opcode of \p MI so that it executes in \p Domain.
	///
	/// Precondition: bit (1 << Domain) is set in the mask returned by
	/// getExecutionDomain(MI). The default implementation does nothing.
	virtual void setExecutionDomain(MachineInstr &MI,
	    unsigned Domain) const {
	}

	/// Report the preferred minimum clearance in front of an instruction that
	/// performs an unwanted partial register update.
	///
	/// Certain instructions write only part of a register and therefore
	/// implicitly read the remaining parts. On an out-of-order CPU this can
	/// stall otherwise unrelated instructions that could have run in parallel.
	///
	/// Example: the x86 instruction cvtsi2ss writes bits [31:0] of the
	/// destination xmm register and leaves bits [127:32] untouched, so it has
	/// to wait for the register's previous value:
	///
	///   addps %xmm1, %xmm0
	///   movaps %xmm0, (%rax)
	///   cvtsi2ss %rbx, %xmm0
	///
	/// Here cvtsi2ss cannot issue before addps has finished, even though the
	/// high bits of %xmm0 are probably not needed.
	///
	/// The returned clearance is measured in instructions: other defs of MI's
	/// operand \p OpNum are avoided in the last N instructions before \p MI.
	/// Only unwanted dependencies should yield a positive value. If the old
	/// bits of the defined register hold useful values, or MI is otherwise
	/// determined to read the dependency, the hook should return 0.
	///
	/// Ways the unwanted dependency may be handled:
	///
	/// 1. Allocate the same register for an MI def and use, turning the
	///    unwanted dependency into a required one.
	///
	/// 2. Allocate a register for the def that has no defs in the previous N
	///    instructions.
	///
	/// 3. Call breakPartialRegDependency() with the same arguments so the
	///    target can insert a dependency-breaking instruction.
	///
	virtual unsigned getPartialRegUpdateClearance(
	    const MachineInstr &MI, unsigned OpNum,
	    const TargetRegisterInfo *TRI) const {
	  // Default: 0, meaning there is no partial register dependency.
	  return 0;
	}

	/// \brief Report the minimum clearance in front of an instruction that
	/// reads an unused register.
	///
	/// AVX instructions, for example, may copy part of a register operand into
	/// the unused high bits of the destination register:
	///
	///   vcvtsi2sdq %rax, undef %xmm0, %xmm14
	///
	/// This copies %xmm0[127:64] into %xmm14 and thereby creates a false
	/// dependence on any earlier write to %xmm0.
	///
	/// The hook behaves like getPartialRegUpdateClearance but takes no operand
	/// index as input; it sets \p OpNum to the index of the unused register
	/// instead.
	virtual unsigned getUndefRegClearance(
	    const MachineInstr &MI, unsigned &OpNum,
	    const TargetRegisterInfo *TRI) const {
	  // Default: 0, meaning there is no undef register dependency.
	  return 0;
	}

	/// Place a dependency-breaking instruction in front of \p MI so that the
	/// unwanted dependency on operand \p OpNum disappears.
	///
	/// This hook is invoked when a def within the last N instructions before
	/// MI could not be avoided (see getPartialRegUpdateClearance).
	///
	/// On x86 an xorps instruction serves as dependency breaker:
	///
	///   addps %xmm1, %xmm0
	///   movaps %xmm0, (%rax)
	///   xorps %xmm0, %xmm0
	///   cvtsi2ss %rbx, %xmm0
	///
	/// If an instruction was inserted, an <imp-kill> operand should be added
	/// to MI; this ties the instructions together in the post-ra scheduler.
	/// The default implementation inserts nothing.
	///
	virtual void breakPartialRegDependency(
	    MachineInstr &MI, unsigned OpNum,
	    const TargetRegisterInfo *TRI) const {
	}

	/// Create the machine-specific model used for scheduling. The default
	/// implementation has none to offer and returns nullptr.
	virtual DFAPacketizer *CreateTargetScheduleState(
	    const TargetSubtargetInfo &) const {
	  return nullptr;
	}

	/// Sometimes, it is possible for the target
	/// to tell, even without aliasing information, that two MIs access different
	/// memory addresses. This function returns true if two MIs access different
	/// memory addresses and false otherwise.
	///
	/// Assumes any physical registers used to compute addresses have the same
	/// value for both instructions. (This is the most useful assumption for
	/// post-RA scheduling.)
	///
	/// See also MachineInstr::mayAlias, which is implemented on top of this
	/// function.
	///
	/// Both instructions must load from or store to memory; this is asserted.
	/// The default implementation conservatively returns false.
	virtual bool
	areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
	                                AliasAnalysis *AA = nullptr) const {
	  assert((MIa.mayLoad() || MIa.mayStore()) &&
	         "MIa must load from or modify a memory location");
	  assert((MIb.mayLoad() || MIb.mayStore()) &&
	         "MIb must load from or modify a memory location");
	  return false;
	}

	/// \brief Supply the LookAheadLimit used by MachineCSE, a heuristic
	/// applied when CSE'ing physical register defs.
	virtual unsigned getMachineCSELookAheadLimit() const {
	  // Kept small by default so that unprofitable quadratic behavior is
	  // avoided.
	  return 5;
	}

	/// List the ids of the target indices (as used by the TargetIndex machine
	/// operand) together with their names.
	///
	/// MIR serialization can only serialize target indices that this method
	/// reports. The default implementation reports none.
	virtual ArrayRef<std::pair<int, const char *>>
	getSerializableTargetIndices() const {
	  return None;
	}

	/// Decompose the machine operand's target flags into two values - the direct
	/// target flag value and any of bit flags that are applied.
	///
	/// \returns the pair (direct flag, bitmask flags). The default
	/// implementation ignores its argument and returns (0, 0).
	virtual std::pair<unsigned, unsigned>
	decomposeMachineOperandsTargetFlags(unsigned /*TF*/) const {
	  return std::make_pair(0u, 0u);
	}

	/// List the direct target flag values together with their names.
	///
	/// MIR serialization can only serialize target flags that this method
	/// reports. The default implementation reports none.
	virtual ArrayRef<std::pair<unsigned, const char *>>
	getSerializableDirectMachineOperandTargetFlags() const {
	  return None;
	}

	/// List the bitmask target flag values together with their names.
	///
	/// MIR serialization can only serialize target flags that this method
	/// reports. The default implementation reports none.
	virtual ArrayRef<std::pair<unsigned, const char *>>
	getSerializableBitmaskMachineOperandTargetFlags() const {
	  return None;
	}

	/// List the MMO (MachineMemOperand) target flag values together with
	/// their names.
	///
	/// MIR serialization can only serialize MMO target flags that this method
	/// reports. The default implementation reports none.
	virtual ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
	getSerializableMachineMemOperandTargetFlags() const {
	  return None;
	}

	/// Decide whether \p Inst is a tail call instruction. Targets that do not
	/// properly set MCID::Return and MCID::Call on their tail call
	/// instructions should override this. By default an instruction counts as
	/// a tail call when it is both a return and a call.
	virtual bool isTailCall(const MachineInstr &Inst) const {
	  if (!Inst.isReturn())
	    return false;
	  return Inst.isCall();
	}

	/// Tell whether \p MI is pinned to the top of its basic block, so that no
	/// other instruction may be inserted ahead of it. Implementing this can
	/// keep the register allocator from inserting spills before such
	/// instructions. The default implementation pins nothing.
	virtual bool
	isBasicBlockPrologue(const MachineInstr &MI) const {
	  return false;
	}

	/// \brief Describes the number of instructions that it will take to call and
	/// construct a frame for a given outlining candidate.
	struct MachineOutlinerInfo {
	  /// Number of instructions to call an outlined function for this candidate.
	  unsigned CallOverhead = 0;

	  /// \brief Number of instructions to construct an outlined function frame
	  /// for this candidate.
	  unsigned FrameOverhead = 0;

	  /// \brief Represents the specific instructions that must be emitted to
	  /// construct a call to this candidate.
	  unsigned CallConstructionID = 0;

	  /// \brief Represents the specific instructions that must be emitted to
	  /// construct a frame for this candidate's outlined function.
	  unsigned FrameConstructionID = 0;

	  /// Default construction zero-initializes every field, so reading a
	  /// default-constructed object never touches indeterminate values.
	  MachineOutlinerInfo() = default;

	  /// Construct with all four values given explicitly.
	  MachineOutlinerInfo(unsigned CallOverhead, unsigned FrameOverhead,
	                      unsigned CallConstructionID,
	                      unsigned FrameConstructionID)
	      : CallOverhead(CallOverhead), FrameOverhead(FrameOverhead),
	        CallConstructionID(CallConstructionID),
	        FrameConstructionID(FrameConstructionID) {}
	};

	/// \brief Returns a \p MachineOutlinerInfo struct containing target-specific
	/// information for a set of outlining candidates.
	///
	/// \param RepeatedSequenceLocs the candidates, each given as a pair of
	///        basic-block iterators.
	/// There is no usable default: reaching the base version is a fatal
	/// "unreachable" error whose message names this hook.
	virtual MachineOutlinerInfo getOutlininingCandidateInfo(
	    std::vector<
	        std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
	        &RepeatedSequenceLocs) const {
	  llvm_unreachable(
	      "Target didn't implement TargetInstrInfo::getOutlininingCandidateInfo!");
	}

	/// Classification the outliner assigns to an instruction.
	enum MachineOutlinerInstrType {
	  Legal,    ///< Safe to outline.
	  Illegal,  ///< Cannot be outlined.
	  Invisible ///< Can be outlined, but shouldn't actually impact the
	            ///< outlining result.
	};

	/// Classify \p MI for the outliner: how, or whether, it should be
	/// outlined. There is no usable default: reaching the base version is a
	/// fatal "unreachable" error.
	virtual MachineOutlinerInstrType
	getOutliningType(MachineInstr &MI) const {
	  llvm_unreachable(
	      "Target didn't implement TargetInstrInfo::getOutliningType!");
	}

	/// Emit a custom epilogue for an outlined function. An implementation may
	/// leave it empty, in which case neither an epilogue nor a return
	/// statement is emitted. There is no usable default: reaching the base
	/// version is a fatal "unreachable" error.
	virtual void insertOutlinerEpilogue(
	    MachineBasicBlock &MBB, MachineFunction &MF,
	    const MachineOutlinerInfo &MInfo) const {
	  llvm_unreachable(
	      "Target didn't implement TargetInstrInfo::insertOutlinerEpilogue!");
	}

	/// Emit a call to an outlined function into the program and return an
	/// iterator to the place where the call was inserted. Targets must
	/// implement this: reaching the base version is a fatal "unreachable"
	/// error.
	virtual MachineBasicBlock::iterator insertOutlinedCall(
	    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
	    MachineFunction &MF, const MachineOutlinerInfo &MInfo) const {
	  llvm_unreachable(
	      "Target didn't implement TargetInstrInfo::insertOutlinedCall!");
	}

	/// Emit a custom prologue for an outlined function. An implementation may
	/// leave it empty, in which case no prologue is emitted. There is no
	/// usable default: reaching the base version is a fatal "unreachable"
	/// error.
	virtual void insertOutlinerPrologue(
	    MachineBasicBlock &MBB, MachineFunction &MF,
	    const MachineOutlinerInfo &MInfo) const {
	  llvm_unreachable(
	      "Target didn't implement TargetInstrInfo::insertOutlinerPrologue!");
	}

	/// Tell whether it is safe to outline from the function \p MF. A function
	/// is considered safe when an outlined function built from its
	/// instructions yields a program that produces the same output for any
	/// set of given inputs. There is no usable default: reaching the base
	/// version is a fatal "unreachable" error.
	virtual bool isFunctionSafeToOutlineFrom(
	    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
	  llvm_unreachable("Target didn't implement "
	                   "TargetInstrInfo::isFunctionSafeToOutlineFrom!");
	}

	private:
	// Opcode numbers kept as private state of the class.
	// NOTE(review): where these are initialized and read lies outside this
	// excerpt; the names suggest call-frame setup/destroy, catch-return and
	// return pseudo opcodes — confirm against the constructor.
	unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
	unsigned CatchRetOpcode;
	unsigned ReturnOpcode;
	};

	/// \brief DenseMapInfo specialization that lets
	/// TargetInstrInfo::RegSubRegPair serve as a DenseMap key. Every operation
	/// is built from the DenseMapInfo of the two unsigned components.
	template <> struct DenseMapInfo<TargetInstrInfo::RegSubRegPair> {
	  using RegInfo = DenseMapInfo<unsigned>;

	  /// Empty key: both components are the unsigned empty key.
	  static inline TargetInstrInfo::RegSubRegPair getEmptyKey() {
	    const unsigned EmptyPart = RegInfo::getEmptyKey();
	    return TargetInstrInfo::RegSubRegPair(EmptyPart, EmptyPart);
	  }

	  /// Tombstone key: both components are the unsigned tombstone key.
	  static inline TargetInstrInfo::RegSubRegPair getTombstoneKey() {
	    const unsigned TombstonePart = RegInfo::getTombstoneKey();
	    return TargetInstrInfo::RegSubRegPair(TombstonePart, TombstonePart);
	  }

	  /// \brief Hash by delegating to the implementation for
	  /// std::pair<unsigned, unsigned>.
	  static unsigned getHashValue(const TargetInstrInfo::RegSubRegPair &Val) {
	    using PairInfo = DenseMapInfo<std::pair<unsigned, unsigned>>;
	    const std::pair<unsigned, unsigned> AsPair(Val.Reg, Val.SubReg);
	    return PairInfo::getHashValue(AsPair);
	  }

	  /// Two keys are equal when both Reg and SubReg compare equal.
	  static bool isEqual(const TargetInstrInfo::RegSubRegPair &LHS,
	                      const TargetInstrInfo::RegSubRegPair &RHS) {
	    if (!RegInfo::isEqual(LHS.Reg, RHS.Reg))
	      return false;
	    return RegInfo::isEqual(LHS.SubReg, RHS.SubReg);
	  }
	};

	} // end namespace llvm

	#endif // LLVM_TARGET_TARGETINSTRINFO_H
	Index: head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h
	===================================================================
	--- head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h (revision 328816)
	+++ head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h (revision 328817)
	@@ -1,3542 +1,3542 @@
	//===- llvm/CodeGen/TargetLowering.h - Target Lowering Info -----*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// This file describes how to lower LLVM code to machine code. This has three
	/// main components:
	///
	/// 1. Which ValueTypes are natively supported by the target.
	/// 2. Which operations are supported for supported ValueTypes.
	/// 3. Cost thresholds for alternative implementations of certain operations.
	///
	/// In addition it has a few other components, like information about FP
	/// immediates.
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_TARGETLOWERING_H
	#define LLVM_CODEGEN_TARGETLOWERING_H

	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/CodeGen/DAGCombine.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetCallingConv.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Type.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Target/TargetMachine.h"
	#include <algorithm>
	#include <cassert>
	#include <climits>
	#include <cstdint>
	#include <iterator>
	#include <map>
	#include <string>
	#include <utility>
	#include <vector>

	namespace llvm {

	class BranchProbability;
	class CCState;
	class CCValAssign;
	class Constant;
	class FastISel;
	class FunctionLoweringInfo;
	class GlobalValue;
	class IntrinsicInst;
	struct KnownBits;
	class LLVMContext;
	class MachineBasicBlock;
	class MachineFunction;
	class MachineInstr;
	class MachineJumpTableInfo;
	class MachineLoop;
	class MachineRegisterInfo;
	class MCContext;
	class MCExpr;
	class Module;
	class TargetRegisterClass;
	class TargetLibraryInfo;
	class TargetRegisterInfo;
	class Value;

	namespace Sched {

	/// Scheduling preference a target can express.
	enum Preference {
	  /// No preference.
	  None,
	  /// Follow source order.
	  Source,
	  /// Scheduling for lowest register pressure.
	  RegPressure,
	  /// Scheduling for both latency and register pressure.
	  Hybrid,
	  /// Scheduling for ILP in low register pressure mode.
	  ILP,
	  /// Scheduling for VLIW targets.
	  VLIW
	};

	} // end namespace Sched

	/// This base class for TargetLowering contains the SelectionDAG-independent
	/// parts that can be used from the rest of CodeGen.
	class TargetLoweringBase {
	public:
	/// Says whether an operation is valid for a target and, when it is not,
	/// which action makes it valid.
	enum LegalizeAction : uint8_t {
	  /// The target natively supports this operation.
	  Legal,
	  /// This operation should be executed in a larger type.
	  Promote,
	  /// Try to expand this to other ops, otherwise use a libcall.
	  Expand,
	  /// Don't try to expand this to other ops, always use a libcall.
	  LibCall,
	  /// Use the LowerOperation hook to implement custom lowering.
	  Custom
	};

	/// Says whether a type is legal for a target and, when it is not, which
	/// action makes it valid.
	enum LegalizeTypeAction : uint8_t {
	  /// The target natively supports this type.
	  TypeLegal,
	  /// Replace this integer with a larger one.
	  TypePromoteInteger,
	  /// Split this integer into two of half the size.
	  TypeExpandInteger,
	  /// Convert this float to a same size integer type, if an operation is
	  /// not supported in target HW.
	  TypeSoftenFloat,
	  /// Split this float into two of half the size.
	  TypeExpandFloat,
	  /// Replace this one-element vector with its element.
	  TypeScalarizeVector,
	  /// Split this vector into two of half the size.
	  TypeSplitVector,
	  /// This vector should be widened into a larger vector.
	  TypeWidenVector,
	  /// Replace this float with a larger one.
	  TypePromoteFloat
	};

	/// LegalizeKind holds the legalization kind that needs to happen to EVT
	/// in order to type-legalize it.
	/// It is a pair: first is the LegalizeTypeAction, second is an EVT.
	using LegalizeKind = std::pair<LegalizeTypeAction, EVT>;

	/// The ways a target may represent true/false values.
	enum BooleanContent {
	  /// Only bit 0 counts, the rest can hold garbage.
	  UndefinedBooleanContent,
	  /// All bits zero except for bit 0.
	  ZeroOrOneBooleanContent,
	  /// All bits equal to bit 0.
	  ZeroOrNegativeOneBooleanContent
	};

	/// The kinds of select support a target may have.
	enum SelectSupportKind {
	  /// The target supports scalar selects (ex: cmov).
	  ScalarValSelect,
	  /// The target supports selects with a scalar condition and vector
	  /// values (ex: cmov).
	  ScalarCondVectorVal,
	  /// The target supports vector selects with a vector mask
	  /// (ex: x86 blends).
	  VectorMaskSelect
	};

	/// What an atomic load/AtomicRMWInst gets expanded to, if anything.
	/// Targets differ in how far they support these atomic instructions and
	/// in which expansions are available to them, hence this enum.
	enum class AtomicExpansionKind {
	  /// Don't expand the instruction.
	  None,
	  /// Expand the instruction into loadlinked/storeconditional; used by
	  /// ARM/AArch64.
	  LLSC,
	  /// Expand the (load) instruction into just a load-linked, which has
	  /// greater atomic guarantees than a normal load.
	  LLOnly,
	  /// Expand the instruction into cmpxchg; used by at least X86.
	  CmpXChg,
	};

	/// When a multiplication should be expanded.
	enum class MulExpansionKind {
	  /// Always expand the instruction.
	  Always,
	  /// Only expand when the resulting instructions are legal or custom.
	  OnlyLegalOrCustom,
	};

	/// One entry of an argument list (see ArgListTy): an IR value, a DAG
	/// node and a type, plus one-bit attribute flags and an alignment.
	/// NOTE(review): the individual flag meanings are inferred from their
	/// names only; setAttributes is defined elsewhere.
	class ArgListEntry {
	public:
	  Value *Val = nullptr;
	  SDValue Node = SDValue();
	  Type *Ty = nullptr;

	  // One-bit flags; all are cleared by the constructor below.
	  bool IsSExt : 1;
	  bool IsZExt : 1;
	  bool IsInReg : 1;
	  bool IsSRet : 1;
	  bool IsNest : 1;
	  bool IsByVal : 1;
	  bool IsInAlloca : 1;
	  bool IsReturned : 1;
	  bool IsSwiftSelf : 1;
	  bool IsSwiftError : 1;

	  uint16_t Alignment = 0;

	  /// Start with every flag false; the remaining members take their
	  /// in-class initializers.
	  ArgListEntry()
	      : IsSExt(false),
	        IsZExt(false),
	        IsInReg(false),
	        IsSRet(false),
	        IsNest(false),
	        IsByVal(false),
	        IsInAlloca(false),
	        IsReturned(false),
	        IsSwiftSelf(false),
	        IsSwiftError(false) {}

	  /// Populate this entry's attributes from argument \p ArgIdx of call site
	  /// \p CS. Only declared here.
	  void setAttributes(ImmutableCallSite *CS, unsigned ArgIdx);
	};
	/// An argument list: a std::vector of ArgListEntry.
	using ArgListTy = std::vector<ArgListEntry>;

	/// Hook receiving the argument list \p Args of a library call with calling
	/// convention \p CC in function \p MF.
	/// NOTE(review): undocumented in this header; presumably lets a target
	/// adjust per-argument attributes — confirm. The default does nothing.
	virtual void markLibCallAttributes(
	    MachineFunction *MF, unsigned CC, ArgListTy &Args) const {
	}

	/// Map a boolean representation to the extension node type that preserves
	/// it when a boolean is widened.
	static ISD::NodeType getExtendForContent(BooleanContent Content) {
	  // Upper bits are undefined: extending with rubbish bits is fine.
	  if (Content == UndefinedBooleanContent)
	    return ISD::ANY_EXTEND;
	  // Upper bits must be zero: extend by adding zero bits.
	  if (Content == ZeroOrOneBooleanContent)
	    return ISD::ZERO_EXTEND;
	  // All bits mirror bit 0: extend by copying the sign bit.
	  if (Content == ZeroOrNegativeOneBooleanContent)
	    return ISD::SIGN_EXTEND;
	  llvm_unreachable("Invalid content kind");
	}

	/// NOTE: The TargetMachine owns TLOF.
	/// Construct from the owning target machine; explicit, so a TargetMachine
	/// never converts implicitly.
	explicit TargetLoweringBase(const TargetMachine &TM);
	/// Copy construction and copy assignment are deleted: objects of this class
	/// are not copyable.
	TargetLoweringBase(const TargetLoweringBase &) = delete;
	TargetLoweringBase &operator=(const TargetLoweringBase &) = delete;
	/// Virtual destructor so derived objects can be destroyed through a base
	/// pointer.
	virtual ~TargetLoweringBase() = default;

	protected:
	/// \brief Initialize all of the actions to default values.
	/// Declaration only (protected); the body is not part of this header section.
	void initActions();

	public:
	const TargetMachine &getTargetMachine() const {
	  // Expose the TM reference held by this object.
	  return TM;
	}

	virtual bool useSoftFloat() const {
	  // Default answer is "no"; virtual so a target may override.
	  return false;
	}

	/// Return the pointer type for the given address space, defaults to
	/// the pointer type from the data layout.
	/// FIXME: The default needs to be removed once all the code is updated.
	MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const {
	  // The pointer type is the integer MVT whose width equals the data
	  // layout's pointer size for address space AS.
	  const auto PtrBits = DL.getPointerSizeInBits(AS);
	  return MVT::getIntegerVT(PtrBits);
	}

	/// Return the type for frame index, which is determined by
	/// the alloca address space specified through the data layout.
	MVT getFrameIndexTy(const DataLayout &DL) const {
	  // Frame indices use the pointer type of the alloca address space.
	  const auto AllocaAS = DL.getAllocaAddrSpace();
	  return getPointerTy(DL, AllocaAS);
	}

	/// Return the type for operands of fence.
	/// TODO: Let fence operands be of i32 type and remove this.
	virtual MVT getFenceOperandTy(const DataLayout &DL) const {
	  // By default fence operands use the default-address-space pointer type.
	  MVT PtrVT = getPointerTy(DL);
	  return PtrVT;
	}

	/// EVT is not used in-tree, but is used by out-of-tree target.
	/// A documentation for this function would be nice...
	/// NOTE(review): presumably yields the MVT used for scalar shift amounts;
	/// confirm against the out-of-line definition.
	virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const;

	/// NOTE(review): undocumented here. Presumably returns the type to use for
	/// the shift-amount operand when shifting a value of type \p LHSTy; confirm
	/// against the out-of-line definition.
	EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const;

	/// Returns the type to be used for the index operand of:
	/// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
	/// ISD::INSERT_SUBVECTOR, and ISD::EXTRACT_SUBVECTOR
	virtual MVT getVectorIdxTy(const DataLayout &DL) const {
	  // Vector index operands default to the pointer type.
	  MVT IdxVT = getPointerTy(DL);
	  return IdxVT;
	}

	/// Default: every kind of select is reported as supported. The parameter is
	/// deliberately unnamed because the default ignores it.
	virtual bool isSelectSupported(SelectSupportKind /*kind*/) const {
	  return true;
	}

	/// Return true if multiple condition registers are available.
	bool hasMultipleConditionRegisters() const {
	  // Plain accessor for the stored flag.
	  const bool Result = HasMultipleConditionRegisters;
	  return Result;
	}

	/// Return true if the target has BitExtract instructions.
	bool hasExtractBitsInsn() const {
	  // Plain accessor for the stored flag.
	  return HasExtractBitsInsn;
	}

	/// Return the preferred vector type legalization action.
	virtual TargetLoweringBase::LegalizeTypeAction
	getPreferredVectorAction(EVT VT) const {
	  // Single-element vectors are scalarized; every other vector is promoted.
	  const bool IsSingleElement = VT.getVectorNumElements() == 1;
	  return IsSingleElement ? TypeScalarizeVector : TypePromoteInteger;
	}

	// There are two general methods for expanding a BUILD_VECTOR node:
	// 1. Use SCALAR_TO_VECTOR on the defined scalar values and then shuffle
	// them together.
	// 2. Build the vector on the stack and then load it.
	// If this function returns true, then method (1) will be used, subject to
	// the constraint that all of the necessary shuffles are legal (as determined
	// by isShuffleMaskLegal). If this function returns false, then method (2) is
	// always used. The vector type, and the number of defined values, are
	// provided.
	virtual bool
	shouldExpandBuildVectorWithShuffles(EVT /* VT */,
	                                    unsigned DefinedValues) const {
	  // Use shuffles only when fewer than three values are defined, i.e. at
	  // most two.
	  return !(DefinedValues >= 3);
	}

	/// Return true if integer divide is usually cheaper than a sequence of
	/// several shifts, adds, and multiplies for this target.
	/// The definition of "cheaper" may depend on whether we're optimizing
	/// for speed or for size.
	virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // Default: integer division is not considered cheap.
	  return false;
	}

	/// Return true if the target can handle a standalone remainder operation.
	virtual bool hasStandaloneRem(EVT VT) const {
	  // Default: a standalone remainder is assumed to be available.
	  const bool Supported = true;
	  return Supported;
	}

	/// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X).
	virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const {
	  // Returning false means SQRT(X) may be rewritten as X*RSQRT(X).
	  const bool KeepSqrt = false;
	  return KeepSqrt;
	}

	/// Reciprocal estimate status values used by the functions below.
	enum ReciprocalEstimate : int {
	  Unspecified = -1, // No explicit setting.
	  Disabled = 0,     // Explicitly turned off.
	  Enabled = 1       // Explicitly turned on.
	};

	/// Return a ReciprocalEstimate enum value for a square root of the given type
	/// based on the function's attributes. If the operation is not overridden by
	/// the function's attributes, "Unspecified" is returned and target defaults
	/// are expected to be used for instruction selection.
	/// The result is typed as int rather than ReciprocalEstimate; "Unspecified"
	/// corresponds to -1 in that enum.
	int getRecipEstimateSqrtEnabled(EVT VT, MachineFunction &MF) const;

	/// Return a ReciprocalEstimate enum value for a division of the given type
	/// based on the function's attributes. If the operation is not overridden by
	/// the function's attributes, "Unspecified" is returned and target defaults
	/// are expected to be used for instruction selection.
	/// The result is typed as int rather than ReciprocalEstimate; "Unspecified"
	/// corresponds to -1 in that enum.
	int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF) const;

	/// Return the refinement step count for a square root of the given type based
	/// on the function's attributes. If the operation is not overridden by
	/// the function's attributes, "Unspecified" is returned and target defaults
	/// are expected to be used for instruction selection.
	/// "Unspecified" is the ReciprocalEstimate enumerator with value -1.
	int getSqrtRefinementSteps(EVT VT, MachineFunction &MF) const;

	/// Return the refinement step count for a division of the given type based
	/// on the function's attributes. If the operation is not overridden by
	/// the function's attributes, "Unspecified" is returned and target defaults
	/// are expected to be used for instruction selection.
	/// "Unspecified" is the ReciprocalEstimate enumerator with value -1.
	int getDivRefinementSteps(EVT VT, MachineFunction &MF) const;

	/// Returns true if target has indicated at least one type should be bypassed.
	bool isSlowDivBypassed() const {
	  // True as soon as the bypass map has at least one entry.
	  const bool NoEntries = BypassSlowDivWidths.empty();
	  return !NoEntries;
	}

	/// Returns map of slow types for division or remainder with corresponding
	/// fast types
	const DenseMap<unsigned int, unsigned int> &getBypassSlowDivWidths() const {
	  // Read-only view of the bypass map; no copy is made.
	  const DenseMap<unsigned int, unsigned int> &Widths = BypassSlowDivWidths;
	  return Widths;
	}

	/// Return true if Flow Control is an expensive operation that should be
	/// avoided.
	bool isJumpExpensive() const {
	  // Plain accessor for the stored flag.
	  return JumpIsExpensive;
	}

	/// Return true if selects are only cheaper than branches if the branch is
	/// unlikely to be predicted right.
	bool isPredictableSelectExpensive() const {
	  // Plain accessor for the stored flag.
	  const bool Result = PredictableSelectIsExpensive;
	  return Result;
	}

	/// If a branch or a select condition is skewed in one direction by more than
	/// this factor, it is very likely to be predicted correctly.
	/// Declaration only; virtual, so a target may supply its own threshold.
	virtual BranchProbability getPredictableBranchThreshold() const;

	/// Return true if the following transform is beneficial:
	/// fold (conv (load x)) -> (load (conv*)x)
	/// On architectures that don't natively support some vector loads
	/// efficiently, casting the load to a smaller vector of larger types and
	/// loading is more efficient, however, this can be undone by optimizations in
	/// dag combiner.
	virtual bool isLoadBitCastBeneficial(EVT LoadVT,
	                                     EVT BitcastVT) const {
	  // If either type is not a simple value type the checks below cannot be
	  // applied (they call getSimpleVT()), so the fold is reported as
	  // beneficial.
	  if (!LoadVT.isSimple() || !BitcastVT.isSimple())
	    return true;

	  MVT LoadMVT = LoadVT.getSimpleVT();

	  // Don't bother doing this if it's just going to be promoted again later,
	  // as doing so might interfere with other combines.
	  if (getOperationAction(ISD::LOAD, LoadMVT) == Promote &&
	      getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT())
	    return false;

	  return true;
	}

	/// Return true if the following transform is beneficial:
	/// (store (y (conv x)), y)) -> (store x, (x))
	virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
	  // Stores reuse the load heuristic unless a target overrides this hook.
	  const bool Beneficial = isLoadBitCastBeneficial(StoreVT, BitcastVT);
	  return Beneficial;
	}

	/// Return true if it is expected to be cheaper to do a store of a non-zero
	/// vector constant with the given size and type for the address space than to
	/// store the individual scalar element constants.
	virtual bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
	                                          unsigned AddrSpace) const {
	  // Default: not cheap, whatever the arguments.
	  return false;
	}

	/// Allow store merging after legalization in addition to before legalization.
	/// This may catch stores that do not exist earlier (eg, stores created from
	/// intrinsics).
	virtual bool mergeStoresAfterLegalization() const {
	  // Default: post-legalization store merging is enabled.
	  return true;
	}

	/// Returns if it's reasonable to merge stores to MemVT size.
	virtual bool canMergeStoresTo(unsigned AS, EVT MemVT,
	                              const SelectionDAG &DAG) const {
	  // Default: merging is allowed for every address space and type.
	  const bool Allowed = true;
	  return Allowed;
	}

	/// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
	virtual bool isCheapToSpeculateCttz() const {
	  // Default: speculation of cttz is not considered cheap.
	  const bool Cheap = false;
	  return Cheap;
	}

	/// \brief Return true if it is cheap to speculate a call to intrinsic ctlz.
	virtual bool isCheapToSpeculateCtlz() const {
	  // Default: speculation of ctlz is not considered cheap.
	  const bool Cheap = false;
	  return Cheap;
	}

	/// \brief Return true if ctlz instruction is fast.
	virtual bool isCtlzFast() const {
	  // Default: ctlz is not assumed to be fast.
	  const bool Fast = false;
	  return Fast;
	}

	/// Return true if it is safe to transform an integer-domain bitwise operation
	/// into the equivalent floating-point operation. This should be set to true
	/// if the target has IEEE-754-compliant fabs/fneg operations for the input
	/// type.
	virtual bool hasBitPreservingFPLogic(EVT VT) const {
	  // Default: the integer-to-FP bitwise transform is not reported as safe.
	  const bool Safe = false;
	  return Safe;
	}

	/// \brief Return true if it is cheaper to split the store of a merged int val
	/// from a pair of smaller values into multiple stores.
	virtual bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const {
	  // Default: splitting into multiple stores is not considered cheaper.
	  const bool Cheaper = false;
	  return Cheaper;
	}

	/// \brief Return if the target supports combining a
	/// chain like:
	/// \code
	/// %andResult = and %val1, #mask
	/// %icmpResult = icmp %andResult, 0
	/// \endcode
	/// into a single machine instruction of a form like:
	/// \code
	/// cc = test %register, #mask
	/// \endcode
	virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
	  // Default: the and+icmp fold is not reported as beneficial.
	  const bool Beneficial = false;
	  return Beneficial;
	}

	/// Use bitwise logic to make pairs of compares more efficient. For example:
	/// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
	/// This should be true when it takes more than one instruction to lower
	/// setcc (cmp+set on x86 scalar), when bitwise ops are faster than logic on
	/// condition bits (crand on PowerPC), and/or when reducing cmp+br is a win.
	virtual bool convertSetCCLogicToBitwiseLogic(EVT VT) const {
	  // Default: keep setcc logic as is.
	  const bool Convert = false;
	  return Convert;
	}

	/// Return the preferred operand type if the target has a quick way to compare
	/// integer values of the given size. Assume that any legal integer type can
	/// be compared efficiently. Targets may override this to allow illegal wide
	/// types to return a vector type if there is support to compare that type.
	virtual MVT hasFastEqualityCompare(unsigned NumBits) const {
	  // Candidate is the integer type of exactly NumBits; it is returned only
	  // when legal, otherwise the invalid-type sentinel is returned.
	  MVT IntVT = MVT::getIntegerVT(NumBits);
	  if (isTypeLegal(IntVT))
	    return IntVT;
	  return MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	/// Return true if the target should transform:
	/// (X & Y) == Y ---> (~X & Y) == 0
	/// (X & Y) != Y ---> (~X & Y) != 0
	///
	/// This may be profitable if the target has a bitwise and-not operation that
	/// sets comparison flags. A target may want to limit the transformation based
	/// on the type of Y or if Y is a constant.
	///
	/// Note that the transform will not occur if Y is known to be a power-of-2
	/// because a mask and compare of a single bit can be handled by inverting the
	/// predicate, for example:
	/// (X & 8) == 8 ---> (X & 8) != 0
	virtual bool hasAndNotCompare(SDValue Y) const {
	  // Default: no flag-setting and-not is assumed.
	  const bool Available = false;
	  return Available;
	}

	/// Return true if the target has a bitwise and-not operation:
	/// X = ~A & B
	/// This can be used to simplify select or other instructions.
	virtual bool hasAndNot(SDValue X) const {
	  // A target with the flag-setting and-not compare is assumed to have the
	  // plain and-not as well, so defer to that query.
	  const bool Available = hasAndNotCompare(X);
	  return Available;
	}

	/// \brief Return true if the target wants to use the optimization that
	/// turns ext(promotableInst1(...(promotableInstN(load)))) into
	/// promotedInst1(...(promotedInstN(ext(load)))).
	bool enableExtLdPromotion() const {
	  // Plain accessor for the stored flag.
	  return EnableExtLdPromotion;
	}

	/// Return true if the target can combine store(extractelement VectorTy,
	/// Idx).
	/// \p Cost[out] gives the cost of that transformation when this is true.
	virtual bool canCombineStoreAndExtract(Type VectorTy, Value Idx,
	unsigned &Cost) const {
	return false;
	}

	/// Return true if target supports floating point exceptions.
	bool hasFloatingPointExceptions() const {
	  // Plain accessor for the stored flag.
	  const bool Result = HasFloatingPointExceptions;
	  return Result;
	}

	/// Return true if target always benefits from combining into FMA for a
	/// given value type. This must typically return false on targets where FMA
	/// takes more cycles to execute than FADD.
	virtual bool enableAggressiveFMAFusion(EVT VT) const {
	  // Default: aggressive FMA fusion is off for every type.
	  const bool Enabled = false;
	  return Enabled;
	}

	/// Return the ValueType of the result of SETCC operations.
	/// Declaration only. NOTE(review): \p VT is presumably the type of the
	/// values being compared; confirm against the definition.
	virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
	EVT VT) const;

	/// Return the ValueType for comparison libcalls. Comparison libcalls include
	/// floating point comparison calls, and Ordered/Unordered check calls on
	/// floating point numbers.
	/// Declaration only; virtual, so a target may supply its own return type.
	virtual
	MVT::SimpleValueType getCmpLibcallReturnType() const;

	/// For targets without i1 registers, this gives the nature of the high-bits
	/// of boolean values held in types wider than i1.
	///
	/// "Boolean values" are special true/false values produced by nodes like
	/// SETCC and consumed (as the condition) by nodes like SELECT and BRCOND.
	/// Not to be confused with general values promoted from i1. Some cpus
	/// distinguish between vectors of boolean and scalars; the isVec parameter
	/// selects between the two kinds. For example on X86 a scalar boolean should
	/// be zero extended from i1, while the elements of a vector of booleans
	/// should be sign extended from i1.
	///
	/// Some cpus also treat floating point types the same way as they treat
	/// vectors instead of the way they treat scalars.
	BooleanContent getBooleanContents(bool isVec, bool isFloat) const {
	  // Vector-ness takes priority: isFloat is consulted only for scalars.
	  if (!isVec) {
	    if (isFloat)
	      return BooleanFloatContents;
	    return BooleanContents;
	  }
	  return BooleanVectorContents;
	}

	BooleanContent getBooleanContents(EVT Type) const {
	  // Classify the type, then defer to the (isVec, isFloat) overload.
	  const bool IsVec = Type.isVector();
	  const bool IsFloat = Type.isFloatingPoint();
	  return getBooleanContents(IsVec, IsFloat);
	}

	/// Return target scheduling preference.
	Sched::Preference getSchedulingPreference() const {
	  // Plain accessor for the stored preference.
	  const Sched::Preference Pref = SchedPreferenceInfo;
	  return Pref;
	}

	/// Some scheduler, e.g. hybrid, can switch to different scheduling heuristics
	/// for different nodes. This function returns the preference (or none) for
	/// the given node.
	virtual Sched::Preference getSchedulingPreference(SDNode *) const {
	  // Default: no per-node preference.
	  const Sched::Preference Pref = Sched::None;
	  return Pref;
	}

	/// Return the register class that should be used for the specified value
	/// type.
	virtual const TargetRegisterClass *getRegClassFor(MVT VT) const {
	  // Table lookup keyed by the simple type; a null entry is a caller error
	  // and trips the assertion in asserts-enabled builds.
	  const TargetRegisterClass *Class = RegClassForVT[VT.SimpleTy];
	  assert(Class && "This value type is not natively supported!");
	  return Class;
	}

	/// Return the 'representative' register class for the specified value
	/// type.
	///
	/// The 'representative' register class is the largest legal super-reg
	/// register class for the register class of the value type. For example, on
	/// i386 the rep register class for i8, i16, and i32 are GR32; while the rep
	/// register class is GR64 on x86_64.
	virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
	  // Table lookup keyed by the simple type; the entry is returned unchecked.
	  return RepRegClassForVT[VT.SimpleTy];
	}

	/// Return the cost of the 'representative' register class for the specified
	/// value type.
	virtual uint8_t getRepRegClassCostFor(MVT VT) const {
	  // Table lookup keyed by the simple type.
	  const uint8_t Cost = RepRegClassCostForVT[VT.SimpleTy];
	  return Cost;
	}

	/// Return true if the target has native support for the specified value type.
	/// This means that it has a register that directly holds it without
	/// promotions or expansions.
	bool isTypeLegal(EVT VT) const {
	  // A simple type must index inside RegClassForVT; extended types are
	  // exempt from the bounds check because they never index the table.
	  assert(!VT.isSimple() ||
	         (unsigned)VT.getSimpleVT().SimpleTy < array_lengthof(RegClassForVT));
	  // Legal means: simple type with a non-null register class entry.
	  return VT.isSimple() && RegClassForVT[VT.getSimpleVT().SimpleTy] != nullptr;
	}

	class ValueTypeActionImpl {
	/// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum
	/// that indicates how instruction selection should deal with the type.
	LegalizeTypeAction ValueTypeActions[MVT::LAST_VALUETYPE];

	public:
	ValueTypeActionImpl() {
	std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions),
	TypeLegal);
	}

	LegalizeTypeAction getTypeAction(MVT VT) const {
	return ValueTypeActions[VT.SimpleTy];
	}

	void setTypeAction(MVT VT, LegalizeTypeAction Action) {
	ValueTypeActions[VT.SimpleTy] = Action;
	}
	};

	const ValueTypeActionImpl &getValueTypeActions() const {
	  // Read-only view of the per-type action table; no copy is made.
	  const ValueTypeActionImpl &Table = ValueTypeActions;
	  return Table;
	}

	/// Return how we should legalize values of this type, either it is already
	/// legal (return 'Legal') or we need to promote it to a larger type (return
	/// 'Promote'), or we need to expand it into multiple registers of smaller
	/// integer type (return 'Expand'). 'Custom' is not an option.
	LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const {
	  // getTypeConversion yields an (action, type) pair; this is the action.
	  const auto Conversion = getTypeConversion(Context, VT);
	  return Conversion.first;
	}
	LegalizeTypeAction getTypeAction(MVT VT) const {
	  // Simple types are answered straight from the action table.
	  const LegalizeTypeAction Recorded = ValueTypeActions.getTypeAction(VT);
	  return Recorded;
	}

	/// For types supported by the target, this is an identity function. For
	/// types that must be promoted to larger types, this returns the larger type
	/// to promote to. For integer types that are larger than the largest integer
	/// register, this contains one step in the expansion to get to the smaller
	/// register. For illegal floating point types, this returns the integer type
	/// to transform to.
	EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const {
	  // getTypeConversion yields an (action, type) pair; this is the type.
	  const auto Conversion = getTypeConversion(Context, VT);
	  return Conversion.second;
	}

	/// For types supported by the target, this is an identity function. For
	/// types that must be expanded (i.e. integer types that are larger than the
	/// largest integer register or illegal floating point types), this returns
	/// the largest legal type it will be expanded to.
	EVT getTypeToExpandTo(LLVMContext &Context, EVT VT) const {
	  assert(!VT.isVector());
	  // Follow the integer-expansion chain one step at a time until the type
	  // stops being marked TypeExpandInteger.
	  LegalizeTypeAction Action = getTypeAction(Context, VT);
	  while (Action == TypeExpandInteger) {
	    VT = getTypeToTransformTo(Context, VT);
	    Action = getTypeAction(Context, VT);
	  }
	  // The chain must end on a legal type; anything else is a caller error.
	  if (Action == TypeLegal)
	    return VT;
	  llvm_unreachable("Type is not legal nor is it to be expanded!");
	}

	/// Vector types are broken down into some number of legal first class types.
	/// For example, EVT::v8f32 maps to 2 EVT::v4f32 with Altivec or SSE1, or 8
	/// promoted EVT::f64 values with the X86 FP stack. Similarly, EVT::v2i64
	/// turns into 4 EVT::i32 values with both PPC and X86.
	///
	/// This method returns the number of registers needed, and the VT for each
	/// register. It also returns the VT and quantity of the intermediate values
	/// before they are promoted/expanded.
	/// Declaration only. \p IntermediateVT, \p NumIntermediates and
	/// \p RegisterVT are taken by non-const reference, i.e. they are outputs;
	/// the unsigned return value is the register count described above.
	unsigned getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
	EVT &IntermediateVT,
	unsigned &NumIntermediates,
	MVT &RegisterVT) const;

	/// Certain targets such as MIPS require that some types such as vectors are
	/// always broken down into scalars in some contexts. This occurs even if the
	/// vector type is legal.
	virtual unsigned getVectorTypeBreakdownForCallingConv(
	    LLVMContext &Context, EVT VT, EVT &IntermediateVT,
	    unsigned &NumIntermediates, MVT &RegisterVT) const {
	  // Default: identical to the ordinary breakdown; outputs are forwarded.
	  const unsigned NumRegs = getVectorTypeBreakdown(
	      Context, VT, IntermediateVT, NumIntermediates, RegisterVT);
	  return NumRegs;
	}

	/// Describes the memory access of a target intrinsic. Every member has an
	/// in-class default, so a default-constructed object is fully initialized.
	struct IntrinsicInfo {
	  unsigned opc = 0; // target opcode
	  EVT memVT;        // memory VT

	  // Value representing the memory location: either an IR value or a
	  // pseudo source value, both held by pointer.
	  PointerUnion<const Value *, const PseudoSourceValue *> ptrVal;

	  int offset = 0;     // offset off of ptrVal
	  unsigned size = 0;  // the size of the memory location
	                      // (taken from memVT if zero)
	  unsigned align = 1; // alignment

	  MachineMemOperand::Flags flags = MachineMemOperand::MONone;
	  IntrinsicInfo() = default;
	};

	/// Given an intrinsic, checks if on the target the intrinsic will need to map
	/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
	/// true and store the intrinsic information into the IntrinsicInfo that was
	/// passed to the function.
	/// Default: no intrinsic is treated as touching memory; the IntrinsicInfo
	/// argument is not written. All parameters are unnamed because the default
	/// ignores them.
	virtual bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
	                                MachineFunction &,
	                                unsigned /*Intrinsic*/) const {
	  return false;
	}

	/// Returns true if the target can instruction select the specified FP
	/// immediate natively. If false, the legalizer will materialize the FP
	/// immediate as a load from a constant pool.
	virtual bool isFPImmLegal(const APFloat &/Imm/, EVT /VT/) const {
	return false;
	}

	/// Targets can use this to indicate that they only support some
	/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
	/// target supports the VECTOR_SHUFFLE node, all mask values are assumed to be
	/// legal.
	/// Default: every mask is legal. Parameters are unnamed because the default
	/// ignores them.
	virtual bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const {
	  return true;
	}

	/// Returns true if the operation can trap for the value type.
	///
	/// VT must be a legal type. By default, we optimistically assume most
	/// operations don't trap except for integer divide and remainder.
	/// Declaration only; \p Op is the operation code and \p VT the value type.
	virtual bool canOpTrap(unsigned Op, EVT VT) const;

	/// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
	/// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
	/// constant pool entry.
	/// Default: no clear mask is legal. Parameters are unnamed because the
	/// default ignores them.
	virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
	                                    EVT /*VT*/) const {
	  return false;
	}

	/// Return how this operation should be treated: either it is legal, needs to
	/// be promoted to a larger size, needs to be expanded to some other code
	/// sequence, or the target has a custom expander for it.
	LegalizeAction getOperationAction(unsigned Op, EVT VT) const {
	  // Extended types have no table row: they are always expanded.
	  if (VT.isExtended())
	    return Expand;
	  // Opcodes past the end of the table (target-specific nodes) must be
	  // legalized by the target itself.
	  const bool OpInTable = Op < array_lengthof(OpActions[0]);
	  if (!OpInTable)
	    return Custom;
	  const unsigned Row = (unsigned)VT.getSimpleVT().SimpleTy;
	  return OpActions[Row][Op];
	}

	/// Return true if the specified operation is legal on this target or can be
	/// made legal with custom lowering. This is used to help guide high-level
	/// lowering decisions.
	bool isOperationLegalOrCustom(unsigned Op, EVT VT) const {
	return (VT == MVT::Other \|\| isTypeLegal(VT)) &&
	(getOperationAction(Op, VT) == Legal \|\|
	getOperationAction(Op, VT) == Custom);
	}

	/// Return true if the specified operation is legal on this target or can be
	/// made legal using promotion. This is used to help guide high-level lowering
	/// decisions.
	bool isOperationLegalOrPromote(unsigned Op, EVT VT) const {
	return (VT == MVT::Other \|\| isTypeLegal(VT)) &&
	(getOperationAction(Op, VT) == Legal \|\|
	getOperationAction(Op, VT) == Promote);
	}

	/// Return true if the specified operation is legal on this target or can be
	/// made legal with custom lowering or using promotion. This is used to help
	/// guide high-level lowering decisions.
	bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT) const {
	return (VT == MVT::Other \|\| isTypeLegal(VT)) &&
	(getOperationAction(Op, VT) == Legal \|\|
	getOperationAction(Op, VT) == Custom \|\|
	getOperationAction(Op, VT) == Promote);
	}

	/// Return true if the operation uses custom lowering, regardless of whether
	/// the type is legal or not.
	bool isOperationCustom(unsigned Op, EVT VT) const {
	return getOperationAction(Op, VT) == Custom;
	}

	/// Return true if lowering to a jump table is allowed.
	- bool areJTsAllowed(const Function *Fn) const {
	+ virtual bool areJTsAllowed(const Function *Fn) const {
	  // A function attribute "no-jump-tables" with value "true" vetoes jump
	  // tables for that function.
	  if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
	    return false;

	  // Otherwise jump tables need either BR_JT or BRIND to be legal or custom
	  // for MVT::Other.
	  return isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
	         isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
	}

	/// Check whether the range [Low,High] fits in a machine word.
	bool rangeFitsInWord(const APInt &Low, const APInt &High,
	                     const DataLayout &DL) const {
	  // FIXME: Using the pointer type doesn't seem ideal.
	  const uint64_t WordBits = DL.getPointerSizeInBits();
	  // Number of values in [Low, High]. The difference is clamped to
	  // UINT64_MAX - 1 so that adding one cannot wrap.
	  const uint64_t Span = (High - Low).getLimitedValue(UINT64_MAX - 1);
	  const uint64_t NumValues = Span + 1;
	  return NumValues <= WordBits;
	}

	/// Return true if lowering to a jump table is suitable for a set of case
	/// clusters which may contain \p NumCases cases, \p Range range of values.
	/// FIXME: This function checks the maximum table size and density, but the
	/// minimum size is not checked. It would be nice if the minimum size were
	/// also checked within this function. Currently, the minimum size check is
	/// performed in findJumpTable() in SelectionDAGBuilder and
	/// getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
	virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases,
	                                    uint64_t Range) const {
	  // The enclosing function decides whether we are optimizing for size.
	  const bool OptForSize = SI->getParent()->getParent()->optForSize();
	  const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize);
	  // When optimizing for size, or when the configured maximum is 0, the
	  // table size is effectively unbounded.
	  const unsigned MaxJumpTableSize =
	      (OptForSize || getMaximumJumpTableSize() == 0)
	          ? UINT_MAX
	          : getMaximumJumpTableSize();
	  // Suitable when the range fits the size limit and the cases are dense
	  // enough: NumCases / Range >= MinDensity percent.
	  return Range <= MaxJumpTableSize &&
	         (NumCases * 100 >= Range * MinDensity);
	}

	/// Return true if lowering to a bit test is suitable for a set of case
	/// clusters which contains \p NumDests unique destinations, \p Low and
	/// \p High as its lowest and highest case values, and expects \p NumCmps
	/// case value comparisons. Check if the number of destinations, comparison
	/// metric, and range are all suitable.
	bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
	                           const APInt &Low, const APInt &High,
	                           const DataLayout &DL) const {
	  // FIXME: I don't think NumCmps is the correct metric: a single case and a
	  // range of cases both require only one branch to lower. Just looking at
	  // the number of clusters and destinations should be enough to decide
	  // whether to build bit tests.

	  // To lower a range with bit tests, the range must fit the bitwidth of a
	  // machine word.
	  if (!rangeFitsInWord(Low, High, DL))
	    return false;

	  // Decide whether it's profitable to lower this range with bit tests. Each
	  // destination requires a bit test and branch, and there is an overall
	  // range check branch. For a small number of clusters, separate
	  // comparisons might be cheaper, and for many destinations, splitting the
	  // range might be better.
	  return (NumDests == 1 && NumCmps >= 3) ||
	         (NumDests == 2 && NumCmps >= 5) ||
	         (NumDests == 3 && NumCmps >= 6);
	}

	/// Return true if the specified operation is illegal on this target or
	/// unlikely to be made legal with custom lowering. This is used to help guide
	/// high-level lowering decisions.
	bool isOperationExpand(unsigned Op, EVT VT) const {
	  // An illegal type counts as "expand" without consulting the action table.
	  return !isTypeLegal(VT) || getOperationAction(Op, VT) == Expand;
	}

	/// Return true if the specified operation is legal on this target.
	bool isOperationLegal(unsigned Op, EVT VT) const {
	  // MVT::Other is exempt from the type-legality requirement.
	  return (VT == MVT::Other || isTypeLegal(VT)) &&
	         getOperationAction(Op, VT) == Legal;
	}

	/// Return how this load with extension should be treated: either it is legal,
	/// needs to be promoted to a larger size, needs to be expanded to some other
	/// code sequence, or the target has a custom expander for it.
	LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT,
	                                EVT MemVT) const {
	  // Extended types have no table entry: they are always expanded.
	  if (ValVT.isExtended() || MemVT.isExtended()) return Expand;
	  unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy;
	  unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy;
	  assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::LAST_VALUETYPE &&
	         MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!");
	  // Each table cell packs one 4-bit action per extension type; select the
	  // nibble belonging to ExtType.
	  unsigned Shift = 4 * ExtType;
	  return (LegalizeAction)((LoadExtActions[ValI][MemI] >> Shift) & 0xf);
	}

	/// Return true if the specified load with extension is legal on this target.
	bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const {
	return getLoadExtAction(ExtType, ValVT, MemVT) == Legal;
	}

	/// Return true if the specified load with extension is legal or custom
	/// on this target.
	bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const {
	return getLoadExtAction(ExtType, ValVT, MemVT) == Legal \|\|
	getLoadExtAction(ExtType, ValVT, MemVT) == Custom;
	}

	/// Return how this store with truncation should be treated: either it is
	/// legal, needs to be promoted to a larger size, needs to be expanded to some
	/// other code sequence, or the target has a custom expander for it.
	LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const {
	  // Extended types have no table entry: they are always expanded.
	  if (ValVT.isExtended() || MemVT.isExtended()) return Expand;
	  unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy;
	  unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy;
	  assert(ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE &&
	         "Table isn't big enough!");
	  // The table is indexed by (value type, memory type).
	  return TruncStoreActions[ValI][MemI];
	}

	/// Return true if the specified store with truncation is legal on this
	/// target.
	bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const {
	  // The value type must be legal before the store action is looked up.
	  if (!isTypeLegal(ValVT))
	    return false;
	  return getTruncStoreAction(ValVT, MemVT) == Legal;
	}

	/// Return true if the specified store with truncation has solution on this
	/// target.
	bool isTruncStoreLegalOrCustom(EVT ValVT, EVT MemVT) const {
	  // The value type must be legal before the store action is looked up.
	  if (!isTypeLegal(ValVT))
	    return false;
	  // Look the action up once and accept either Legal or Custom.
	  const LegalizeAction Action = getTruncStoreAction(ValVT, MemVT);
	  return Action == Legal || Action == Custom;
	}

	/// Return how the indexed load should be treated: either it is legal, needs
	/// to be promoted to a larger size, needs to be expanded to some other code
	/// sequence, or the target has a custom expander for it.
	LegalizeAction
	getIndexedLoadAction(unsigned IdxMode, MVT VT) const {
	  assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
	         "Table isn't big enough!");
	  // Each cell packs two actions; the load action is the high nibble.
	  const unsigned Row = (unsigned)VT.SimpleTy;
	  const auto Packed = IndexedModeActions[Row][IdxMode];
	  return (LegalizeAction)((Packed & 0xf0) >> 4);
	}

	/// Return true if the specified indexed load is legal on this target.
	bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const {
	return VT.isSimple() &&
	(getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Legal \|\|
	getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Custom);
	}

	/// Return how the indexed store should be treated: either it is legal, needs
	/// to be promoted to a larger size, needs to be expanded to some other code
	/// sequence, or the target has a custom expander for it.
	LegalizeAction
	getIndexedStoreAction(unsigned IdxMode, MVT VT) const {
	  assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
	         "Table isn't big enough!");
	  // Each cell packs two actions; the store action is the low nibble.
	  const unsigned Row = (unsigned)VT.SimpleTy;
	  const auto Packed = IndexedModeActions[Row][IdxMode];
	  return (LegalizeAction)(Packed & 0x0f);
	}

	/// Return true if the specified indexed store is legal on this target.
	bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const {
	return VT.isSimple() &&
	(getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Legal \|\|
	getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom);
	}

	/// Return how the condition code should be treated: either it is legal, needs
	/// to be expanded to some other code sequence, or the target has a custom
	/// expander for it.
	LegalizeAction
	getCondCodeAction(ISD::CondCode CC, MVT VT) const {
	assert((unsigned)CC < array_lengthof(CondCodeActions) &&
	((unsigned)VT.SimpleTy >> 3) < array_lengthof(CondCodeActions[0]) &&
	"Table isn't big enough!");
	// See setCondCodeAction for how this is encoded.
	uint32_t Shift = 4 * (VT.SimpleTy & 0x7);
	uint32_t Value = CondCodeActions[CC][VT.SimpleTy >> 3];
	LegalizeAction Action = (LegalizeAction) ((Value >> Shift) & 0xF);
	assert(Action != Promote && "Can't promote condition code!");
	return Action;
	}

	/// Return true if the specified condition code is legal on this target.
	bool isCondCodeLegal(ISD::CondCode CC, MVT VT) const {
	  // Look the action up once; note that Custom is accepted as well as Legal.
	  const LegalizeAction Action = getCondCodeAction(CC, VT);
	  return Action == Legal || Action == Custom;
	}

	/// If the action for this operation is to promote, this method returns the
	/// ValueType to promote to.
	MVT getTypeToPromoteTo(unsigned Op, MVT VT) const {
	  assert(getOperationAction(Op, VT) == Promote &&
	         "This operation isn't promoted!");

	  // An explicitly registered promotion target wins over the automatic
	  // search below.
	  std::map<std::pair<unsigned, MVT::SimpleValueType>,
	           MVT::SimpleValueType>::const_iterator PTTI =
	      PromoteToType.find(std::make_pair(Op, VT.SimpleTy));
	  if (PTTI != PromoteToType.end()) return PTTI->second;

	  assert((VT.isInteger() || VT.isFloatingPoint()) &&
	         "Cannot autopromote this type, add it with AddPromotedToType.");

	  // Walk upwards through the simple value types, one enumerator at a time,
	  // until reaching a legal type for which Op is not itself promoted. The
	  // assertion checks that the walk stays within the same integer/non-integer
	  // family and does not reach isVoid.
	  MVT NVT = VT;
	  do {
	    NVT = (MVT::SimpleValueType)(NVT.SimpleTy+1);
	    assert(NVT.isInteger() == VT.isInteger() && NVT != MVT::isVoid &&
	           "Didn't find type to promote to!");
	  } while (!isTypeLegal(NVT) ||
	           getOperationAction(Op, NVT) == Promote);
	  return NVT;
	}

	/// Return the EVT corresponding to this LLVM type. This is fixed by the LLVM
	/// operations except for the pointer size. When AllowUnknown is true,
	/// MVT::Other is returned for types with no EVT counterpart (e.g. structs);
	/// otherwise such types trigger an assertion.
	EVT getValueType(const DataLayout &DL, Type *Ty,
	                 bool AllowUnknown = false) const {
	  // Scalar pointers map onto the native pointer type of their address space.
	  if (PointerType *PtrTy = dyn_cast<PointerType>(Ty))
	    return getPointerTy(DL, PtrTy->getAddressSpace());
	  // Anything that is not a vector goes through the generic conversion.
	  if (!Ty->isVectorTy())
	    return EVT::getEVT(Ty, AllowUnknown);
	  VectorType *VecTy = cast<VectorType>(Ty);
	  Type *EltTy = VecTy->getElementType();
	  // Vectors of pointers become vectors of the native pointer type.
	  if (PointerType *EltPtrTy = dyn_cast<PointerType>(EltTy)) {
	    EVT NativePtrVT(getPointerTy(DL, EltPtrTy->getAddressSpace()));
	    EltTy = NativePtrVT.getTypeForEVT(Ty->getContext());
	  }
	  return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false),
	                          VecTy->getNumElements());
	}

	/// Return the MVT corresponding to this LLVM type: the result of
	/// getValueType for the same arguments, converted with getSimpleVT().
	MVT getSimpleValueType(const DataLayout &DL, Type *Ty,
	                       bool AllowUnknown = false) const {
	  const EVT FullVT = getValueType(DL, Ty, AllowUnknown);
	  return FullVT.getSimpleVT();
	}

	/// Return the desired alignment for ByVal or InAlloca aggregate function
	/// arguments in the caller parameter area. This is the actual alignment, not
	/// its logarithm.
	///
	/// Only declared here; being virtual, a target may provide its own version.
	virtual unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const;

	/// Return the register type that values of simple type VT will eventually
	/// require, as recorded in the RegisterTypeForVT table.
	MVT getRegisterType(MVT VT) const {
	  // The simple type doubles as the table index, so it must be in range.
	  assert((unsigned)VT.SimpleTy < array_lengthof(RegisterTypeForVT));
	  return RegisterTypeForVT[VT.SimpleTy];
	}

	/// Return the type of registers that this ValueType will eventually require.
	///
	/// Simple types are read from RegisterTypeForVT, extended vectors use the
	/// register type reported by getVectorTypeBreakdown, and extended integers
	/// recurse on the type returned by getTypeToTransformTo. Any other extended
	/// type is unsupported.
	MVT getRegisterType(LLVMContext &Context, EVT VT) const {
	  if (VT.isSimple()) {
	    assert((unsigned)VT.getSimpleVT().SimpleTy <
	           array_lengthof(RegisterTypeForVT));
	    return RegisterTypeForVT[VT.getSimpleVT().SimpleTy];
	  }
	  if (VT.isVector()) {
	    // Only the register type output of the breakdown is of interest here.
	    EVT IgnoredVT;
	    unsigned NumIntermediates;
	    MVT RegVT;
	    (void)getVectorTypeBreakdown(Context, VT, IgnoredVT, NumIntermediates,
	                                 RegVT);
	    return RegVT;
	  }
	  if (VT.isInteger())
	    return getRegisterType(Context, getTypeToTransformTo(Context, VT));
	  llvm_unreachable("Unsupported extended type!");
	}

	/// Return the number of registers that this ValueType will eventually
	/// require.
	///
	/// This is one for any types promoted to live in larger registers, but may be
	/// more than one for types (like i64) that are split into pieces. For types
	/// like i140, which are first promoted then expanded, it is the number of
	/// registers needed to hold all the bits of the original type. For an i140
	/// on a 32 bit machine this means 5 registers.
	unsigned getNumRegisters(LLVMContext &Context, EVT VT) const {
	  if (VT.isSimple()) {
	    assert((unsigned)VT.getSimpleVT().SimpleTy <
	           array_lengthof(NumRegistersForVT));
	    return NumRegistersForVT[VT.getSimpleVT().SimpleTy];
	  }
	  if (VT.isVector()) {
	    // The breakdown's return value is the register count; its outputs are
	    // not needed here.
	    EVT IgnoredVT;
	    MVT IgnoredRegVT;
	    unsigned NumIntermediates;
	    return getVectorTypeBreakdown(Context, VT, IgnoredVT, NumIntermediates,
	                                  IgnoredRegVT);
	  }
	  if (VT.isInteger()) {
	    // Round the quotient up so that every bit of VT has a register.
	    const unsigned TotalBits = VT.getSizeInBits();
	    const unsigned BitsPerReg = getRegisterType(Context, VT).getSizeInBits();
	    return (TotalBits + BitsPerReg - 1) / BitsPerReg;
	  }
	  llvm_unreachable("Unsupported extended type!");
	}

	/// Certain combinations of ABIs, Targets and features require that types
	/// are legal for some operations and not for other operations.
	/// For MIPS all vector types must be passed through the integer register set.
	///
	/// The default simply forwards to getRegisterType(VT).
	virtual MVT getRegisterTypeForCallingConv(MVT VT) const {
	  const MVT DefaultRegVT = getRegisterType(VT);
	  return DefaultRegVT;
	}

	/// Overload taking an EVT and a context; the default forwards to
	/// getRegisterType(Context, VT).
	virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context,
	                                          EVT VT) const {
	  const MVT DefaultRegVT = getRegisterType(Context, VT);
	  return DefaultRegVT;
	}

	/// Certain targets require unusual breakdowns of certain types. For MIPS,
	/// this occurs when a vector type is used, as vectors are passed through the
	/// integer register set.
	///
	/// The default forwards to getNumRegisters(Context, VT).
	virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context,
	                                               EVT VT) const {
	  const unsigned DefaultCount = getNumRegisters(Context, VT);
	  return DefaultCount;
	}

	/// Certain targets have context sensitive alignment requirements, where one
	/// type has the alignment requirement of another type.
	///
	/// The default returns DL.getABITypeAlignment(ArgTy).
	virtual unsigned getABIAlignmentForCallingConv(Type *ArgTy,
	                                               DataLayout DL) const {
	  const unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
	  return ABIAlign;
	}

	/// If true, then instruction selection should seek to shrink the FP constant
	/// of the specified type to a smaller type in order to save space and / or
	/// reduce runtime. The default answers true for every type.
	virtual bool ShouldShrinkFPConstant(EVT) const {
	  return true;
	}

	/// Return true if it is profitable to reduce the given load node to a
	/// smaller type.
	///
	/// e.g. (i16 (trunc (i32 (load x))) -> i16 load x should be performed
	///
	/// The default ignores its arguments and always answers true.
	virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
	                                   EVT NewVT) const {
	  return true;
	}

	/// When splitting a value of the specified type into parts, does the Lo
	/// or Hi part come first? This usually follows the endianness, except
	/// for ppcf128, where the Hi part always comes first.
	///
	/// Returns true when DL is big-endian or VT is MVT::ppcf128.
	bool hasBigEndianPartOrdering(EVT VT, const DataLayout &DL) const {
	  if (DL.isBigEndian())
	    return true;
	  return VT == MVT::ppcf128;
	}

	/// If true, the target has custom DAG combine transformations that it can
	/// perform for the specified node.
	bool hasTargetDAGCombine(ISD::NodeType NT) const {
	  assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
	  // One bit per node type: NT >> 3 selects the array element and the low
	  // three bits of NT select the bit within it.
	  return (TargetDAGCombineArray[NT >> 3] & (1 << (NT & 7))) != 0;
	}

	/// Return the value currently stored in GatherAllAliasesMaxDepth.
	unsigned getGatherAllAliasesMaxDepth() const {
	  return GatherAllAliasesMaxDepth;
	}

	/// Returns the size of the platform's va_list object.
	///
	/// The default is the bit size of the pointer type returned by
	/// getPointerTy(DL).
	virtual unsigned getVaListSizeInBits(const DataLayout &DL) const {
	  const auto PtrTy = getPointerTy(DL);
	  return PtrTy.getSizeInBits();
	}

	/// \brief Get maximum # of store operations permitted for llvm.memset
	///
	/// This is the maximum number of store operations permitted to replace a
	/// call to llvm.memset. The value is set by the target at the performance
	/// threshold for such a replacement. If OptSize is true, the limit for
	/// functions that have the OptSize attribute is returned instead.
	unsigned getMaxStoresPerMemset(bool OptSize) const {
	  if (OptSize)
	    return MaxStoresPerMemsetOptSize;
	  return MaxStoresPerMemset;
	}

	/// \brief Get maximum # of store operations permitted for llvm.memcpy
	///
	/// This is the maximum number of store operations permitted to replace a
	/// call to llvm.memcpy. The value is set by the target at the performance
	/// threshold for such a replacement. If OptSize is true, the limit for
	/// functions that have the OptSize attribute is returned instead.
	unsigned getMaxStoresPerMemcpy(bool OptSize) const {
	  if (OptSize)
	    return MaxStoresPerMemcpyOptSize;
	  return MaxStoresPerMemcpy;
	}

	/// Get maximum # of load operations permitted for memcmp
	///
	/// This is the maximum number of load operations permitted to replace a
	/// call to memcmp. The value is set by the target at the performance
	/// threshold for such a replacement. If OptSize is true, the limit for
	/// functions that have the OptSize attribute is returned instead.
	unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
	  if (OptSize)
	    return MaxLoadsPerMemcmpOptSize;
	  return MaxLoadsPerMemcmp;
	}

	/// \brief Get maximum # of store operations permitted for llvm.memmove
	///
	/// This is the maximum number of store operations permitted to replace a
	/// call to llvm.memmove. The value is set by the target at the performance
	/// threshold for such a replacement. If OptSize is true, the limit for
	/// functions that have the OptSize attribute is returned instead.
	unsigned getMaxStoresPerMemmove(bool OptSize) const {
	  if (OptSize)
	    return MaxStoresPerMemmoveOptSize;
	  return MaxStoresPerMemmove;
	}

	/// \brief Determine if the target supports unaligned memory accesses.
	///
	/// This function returns true if the target allows unaligned memory accesses
	/// of the specified type in the given address space. If true, it also returns
	/// whether the unaligned memory access is "fast" in the last argument by
	/// reference. This is used, for example, in situations where an array
	/// copy/move/set is converted to a sequence of store operations. Its use
	/// helps to ensure that such replacements don't generate code that causes an
	/// alignment error (trap) on the target machine.
	///
	/// The default ignores its arguments and answers false.
	virtual bool allowsMisalignedMemoryAccesses(EVT,
	                                            unsigned AddrSpace = 0,
	                                            unsigned Align = 1,
	                                            bool * /*Fast*/ = nullptr) const {
	  return false;
	}

	/// Return true if the target supports a memory access of this type for the
	/// given address space and alignment. If the access is allowed, the optional
	/// final parameter returns if the access is also fast (as defined by the
	/// target).
	///
	/// Only declared here. AddrSpace defaults to 0, Alignment to 1 and Fast to
	/// nullptr.
	bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
	unsigned AddrSpace = 0, unsigned Alignment = 1,
	bool *Fast = nullptr) const;

	/// Returns the target specific optimal type for load and store operations as
	/// a result of memset, memcpy, and memmove lowering.
	///
	/// If DstAlign is zero, the destination alignment can satisfy any
	/// constraint. Similarly, if SrcAlign is zero there isn't a need to check it
	/// against the alignment requirement, probably because the source does not
	/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
	/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
	/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
	/// not need to be loaded. It returns MVT::Other (which is what this default
	/// always does) if the type should be determined using generic
	/// target-independent logic.
	virtual EVT getOptimalMemOpType(uint64_t /*Size*/,
	                                unsigned /*DstAlign*/, unsigned /*SrcAlign*/,
	                                bool /*IsMemset*/,
	                                bool /*ZeroMemset*/,
	                                bool /*MemcpyStrSrc*/,
	                                MachineFunction & /*MF*/) const {
	  return MVT::Other;
	}

	/// Returns true if it's safe to use load / store of the specified type to
	/// expand memcpy / memset inline.
	///
	/// This is mostly true for all types except for some special cases. For
	/// example, on X86 targets without SSE2 f64 load / store are done with fldl /
	/// fstpl which also does type conversion. Note the specified type doesn't
	/// have to be legal as the hook is used before type legalization.
	///
	/// The default answers true for every type.
	virtual bool isSafeMemOpType(MVT /*VT*/) const {
	  return true;
	}

	/// Report whether _setjmp (true) or setjmp (false) should be used to
	/// implement llvm.setjmp.
	bool usesUnderscoreSetJmp() const { return UseUnderscoreSetJmp; }

	/// Report whether _longjmp (true) or longjmp (false) should be used to
	/// implement llvm.longjmp.
	bool usesUnderscoreLongJmp() const { return UseUnderscoreLongJmp; }

	/// Return lower limit for number of blocks in a jump table.
	///
	/// Only declared here; being virtual, a target may provide its own version.
	virtual unsigned getMinimumJumpTableEntries() const;

	/// Return lower limit of the density in a jump table.
	///
	/// Only declared here. NOTE(review): OptForSize presumably selects a
	/// separate limit for size-optimized code; confirm against the definition.
	unsigned getMinimumJumpTableDensity(bool OptForSize) const;

	/// Return upper limit for number of entries in a jump table.
	/// Zero if no limit.
	///
	/// Only declared here.
	unsigned getMaximumJumpTableSize() const;

	/// Jump-table relativity hook. The default reports whether the target
	/// machine TM is position independent.
	virtual bool isJumpTableRelative() const {
	  const bool IsPIC = TM.isPositionIndependent();
	  return IsPIC;
	}

	/// If a physical register, this is the register that
	/// llvm.savestack/llvm.restorestack should save and restore.
	unsigned getStackPointerRegisterToSaveRestore() const {
	  return StackPointerRegisterToSaveRestore;
	}

	/// If a physical register, this returns the register that receives the
	/// exception address on entry to an EH pad.
	///
	/// The default ignores PersonalityFn and returns 0, which is guaranteed to
	/// be the NoRegister value on all targets.
	virtual unsigned
	getExceptionPointerRegister(const Constant *PersonalityFn) const {
	  return 0;
	}

	/// If a physical register, this returns the register that receives the
	/// exception typeid on entry to a landing pad.
	///
	/// The default ignores PersonalityFn and returns 0, which is guaranteed to
	/// be the NoRegister value on all targets.
	virtual unsigned
	getExceptionSelectorRegister(const Constant *PersonalityFn) const {
	  return 0;
	}

	/// Funclet EH hook. The default does not return a value: it reports a fatal
	/// error stating that funclet EH is not implemented for this target.
	virtual bool needsFixedCatchObjects() const {
	  report_fatal_error("Funclet EH is not implemented for this target");
	}

	/// Returns the target's jmp_buf size in bytes (if never set, the default is
	/// 200).
	unsigned getJumpBufSize() const { return JumpBufSize; }

	/// Returns the target's jmp_buf alignment in bytes (if never set, the
	/// default is 0).
	unsigned getJumpBufAlignment() const { return JumpBufAlignment; }

	/// Return the minimum stack alignment of an argument, as stored in
	/// MinStackArgumentAlignment.
	unsigned getMinStackArgumentAlignment() const {
	  return MinStackArgumentAlignment;
	}

	/// Return the minimum function alignment, as stored in
	/// MinFunctionAlignment.
	unsigned getMinFunctionAlignment() const { return MinFunctionAlignment; }

	/// Return the preferred function alignment, as stored in
	/// PrefFunctionAlignment.
	unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }

	/// Return the preferred loop alignment.
	///
	/// The default ignores ML and returns the stored PrefLoopAlignment.
	virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const {
	  return PrefLoopAlignment;
	}

	/// If the target has a standard location for the stack protector guard,
	/// returns the address of that location. Otherwise, returns nullptr.
	/// DEPRECATED: please override useLoadStackGuardNode and customize
	/// LOAD_STACK_GUARD, or customize @llvm.stackguard().
	///
	/// Only declared here; being virtual, a target may provide its own version.
	virtual Value *getIRStackGuard(IRBuilder<> &IRB) const;

	/// Inserts necessary declarations for SSP (stack protection) purpose.
	/// Should be used only when getIRStackGuard returns nullptr.
	///
	/// Only declared here; being virtual, a target may provide its own version.
	virtual void insertSSPDeclarations(Module &M) const;

	/// Return the variable that's previously inserted by insertSSPDeclarations,
	/// if any, otherwise return nullptr. Should be used only when
	/// getIRStackGuard returns nullptr.
	///
	/// Only declared here; being virtual, a target may provide its own version.
	virtual Value *getSDagStackGuard(const Module &M) const;

	/// If this function returns true, stack protection checks should XOR the
	/// frame pointer (or whichever pointer is used to address locals) into the
	/// stack guard value before checking it. getIRStackGuard must return nullptr
	/// if this returns true. The default answers false.
	virtual bool useStackGuardXorFP() const {
	  return false;
	}

	/// If the target has a standard stack protection check function that
	/// performs validation and error handling, returns the function. Otherwise,
	/// returns nullptr. Must be previously inserted by insertSSPDeclarations.
	/// Should be used only when getIRStackGuard returns nullptr.
	///
	/// Only declared here; being virtual, a target may provide its own version.
	virtual Value *getSSPStackGuardCheck(const Module &M) const;

	protected:
	/// Protected, non-virtual helper that is only declared here.
	/// NOTE(review): presumably builds the default location later returned by
	/// getSafeStackPointerLocation, with UseTLS selecting a thread-local
	/// variant; confirm against the definition.
	Value *getDefaultSafeStackPointerLocation(IRBuilder<> &IRB,
	bool UseTLS) const;

	public:
	/// Returns the target-specific address of the unsafe stack pointer.
	///
	/// Only declared here; being virtual, a target may provide its own version.
	virtual Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const;

	/// Returns the name of the symbol used to emit stack probes or the empty
	/// string if not applicable. The default ignores MF and returns the empty
	/// string.
	virtual StringRef getStackProbeSymbolName(MachineFunction &MF) const {
	  return "";
	}

	/// Returns true if a cast between SrcAS and DestAS is a noop. The default
	/// answers false for every pair of address spaces.
	virtual bool isNoopAddrSpaceCast(unsigned SrcAS,
	                                 unsigned DestAS) const {
	  return false;
	}

	/// Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we
	/// are happy to sink it into basic blocks. By default a cast is considered
	/// cheap exactly when isNoopAddrSpaceCast says it is a noop.
	virtual bool isCheapAddrSpaceCast(unsigned SrcAS,
	                                  unsigned DestAS) const {
	  const bool IsNoop = isNoopAddrSpaceCast(SrcAS, DestAS);
	  return IsNoop;
	}

	/// Return true if the pointer arguments to CI should be aligned by aligning
	/// the object whose address is being passed. If so then MinSize is set to the
	/// minimum size the object must be to be aligned and PrefAlign is set to the
	/// preferred alignment.
	///
	/// The default answers false and leaves both outputs untouched.
	virtual bool shouldAlignPointerArgs(CallInst * /*CI*/,
	                                    unsigned & /*MinSize*/,
	                                    unsigned & /*PrefAlign*/) const {
	  return false;
	}

	//===--------------------------------------------------------------------===//
	/// \name Helpers for TargetTransformInfo implementations
	/// @{

	/// Get the ISD node that corresponds to the Instruction class opcode.
	///
	/// Only declared here; the result is returned as an int.
	int InstructionOpcodeToISD(unsigned Opcode) const;

	/// Estimate the cost of type-legalization and the legalized type.
	///
	/// Only declared here; the two results come back as an (int, MVT) pair.
	std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
	Type *Ty) const;

	/// @}

	//===--------------------------------------------------------------------===//
	/// \name Helpers for atomic expansion.
	/// @{

	/// Returns the maximum atomic operation size (in bits) supported by
	/// the backend. Atomic operations greater than this size (as well
	/// as ones that are not naturally aligned), will be expanded by
	/// AtomicExpandPass into an __atomic_* library call.
	unsigned getMaxAtomicSizeInBitsSupported() const {
	  const unsigned MaxBits = MaxAtomicSizeInBitsSupported;
	  return MaxBits;
	}

	/// Returns the size of the smallest cmpxchg or ll/sc instruction
	/// the backend supports. Any smaller operations are widened in
	/// AtomicExpandPass.
	///
	/// Note that unlike operations above the maximum size, atomic ops
	/// are still natively supported below the minimum; they just
	/// require a more complex expansion.
	unsigned getMinCmpXchgSizeInBits() const {
	  return MinCmpXchgSizeInBits;
	}

	/// Whether the target supports unaligned atomic operations, as stored in
	/// SupportsUnalignedAtomics.
	bool supportsUnalignedAtomics() const {
	  return SupportsUnalignedAtomics;
	}

	/// Whether AtomicExpandPass should automatically insert fences and reduce
	/// ordering for this atomic. This should be true for most architectures with
	/// weak memory ordering. Defaults to false.
	virtual bool shouldInsertFencesForAtomic(const Instruction *I) const {
	  return false;
	}

	/// Perform a load-linked operation on Addr, returning a "Value *" with the
	/// corresponding pointee type. This may entail some non-trivial operations to
	/// truncate or reconstruct types that will be illegal in the backend. See
	/// ARMISelLowering for an example implementation.
	///
	/// The default is unreachable: a target that relies on this hook has to
	/// override it.
	virtual Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
	                              AtomicOrdering Ord) const {
	  llvm_unreachable("Load linked unimplemented on this target");
	}

	/// Perform a store-conditional operation to Addr. Return the status of the
	/// store. This should be 0 if the store succeeded, non-zero otherwise.
	///
	/// The default is unreachable: a target that relies on this hook has to
	/// override it.
	virtual Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
	                                    Value *Addr, AtomicOrdering Ord) const {
	  llvm_unreachable("Store conditional unimplemented on this target");
	}

	/// Inserts in the IR a target-specific intrinsic specifying a fence.
	/// It is called by AtomicExpandPass before expanding an
	/// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad
	/// if shouldInsertFencesForAtomic returns true.
	///
	/// Inst is the original atomic instruction, prior to other expansions that
	/// may be performed.
	///
	/// This function should either return a nullptr, or a pointer to an IR-level
	/// Instruction*. Even complex fence sequences can be represented by a
	/// single Instruction* through an intrinsic to be lowered later.
	/// Backends should override this method to produce target-specific intrinsic
	/// for their fences.
	/// FIXME: Please note that the default implementation here in terms of
	/// IR-level fences exists for historical/compatibility reasons and is
	/// unsound! Fences cannot, in general, be used to restore sequential
	/// consistency. For example, consider the following example:
	/// atomic<int> x = y = 0;
	/// int r1, r2, r3, r4;
	/// Thread 0:
	/// x.store(1);
	/// Thread 1:
	/// y.store(1);
	/// Thread 2:
	/// r1 = x.load();
	/// r2 = y.load();
	/// Thread 3:
	/// r3 = y.load();
	/// r4 = x.load();
	/// r1 = r3 = 1 and r2 = r4 = 0 is impossible as long as the accesses are all
	/// seq_cst. But if they are lowered to monotonic accesses, no amount of
	/// IR-level fences can prevent it.
	///
	/// The default leading fence is an IR fence with ordering Ord, created only
	/// when Ord is release or stronger and Inst has an atomic store; otherwise
	/// nullptr is returned.
	/// @{
	virtual Instruction *emitLeadingFence(IRBuilder<> &Builder,
	                                      Instruction *Inst,
	                                      AtomicOrdering Ord) const {
	  if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
	    return Builder.CreateFence(Ord);
	  return nullptr;
	}

	/// Default trailing-fence hook: creates an IR fence with ordering Ord when
	/// Ord is acquire or stronger and returns nullptr otherwise. Inst is not
	/// consulted by this default.
	virtual Instruction *emitTrailingFence(IRBuilder<> &Builder,
	                                       Instruction *Inst,
	                                       AtomicOrdering Ord) const {
	  if (!isAcquireOrStronger(Ord))
	    return nullptr;
	  return Builder.CreateFence(Ord);
	}
	/// @}

	/// Emits code that executes when the comparison result in the ll/sc
	/// expansion of a cmpxchg instruction is such that the store-conditional
	/// will not execute. This makes it possible to balance out the load-linked
	/// with a dedicated instruction, if desired.
	/// E.g., on ARM, if ldrex isn't followed by strex, the exclusive monitor
	/// would be unnecessarily held, except if clrex, inserted by this hook, is
	/// executed. The default emits nothing.
	virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const {
	}

	/// Returns true if the given (atomic) store should be expanded by the
	/// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input.
	/// The default answers false.
	virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const { return false; }

	/// Returns true if arguments should be sign-extended in lib calls. The
	/// default ignores Type and simply echoes IsSigned.
	virtual bool shouldSignExtendTypeInLibCall(EVT Type,
	                                           bool IsSigned) const {
	  return IsSigned;
	}

	/// Returns how the given (atomic) load should be expanded by the
	/// IR-level AtomicExpand pass. The default is AtomicExpansionKind::None.
	virtual AtomicExpansionKind
	shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  return AtomicExpansionKind::None;
	}

	/// Returns true if the given atomic cmpxchg should be expanded by the
	/// IR-level AtomicExpand pass into a load-linked/store-conditional sequence
	/// (through emitLoadLinked() and emitStoreConditional()). The default
	/// answers false.
	virtual bool
	shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
	  return false;
	}

	/// Returns how the IR-level AtomicExpand pass should expand the given
	/// AtomicRMW, if at all. Default is to never expand
	/// (AtomicExpansionKind::None).
	virtual AtomicExpansionKind
	shouldExpandAtomicRMWInIR(AtomicRMWInst *) const {
	  return AtomicExpansionKind::None;
	}

	/// On some platforms, an AtomicRMW that never actually modifies the value
	/// (such as fetch_add of 0) can be turned into a fence followed by an
	/// atomic load. This may sound useless, but it makes it possible for the
	/// processor to keep the cacheline shared, dramatically improving
	/// performance. And such idempotent RMWs are useful for implementing some
	/// kinds of locks, see for example (justification + benchmarks):
	/// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
	/// This method tries doing that transformation, returning the atomic load if
	/// it succeeds, and nullptr otherwise.
	/// If shouldExpandAtomicLoadInIR requests expansion of that load, it will
	/// undergo another round of expansion.
	///
	/// The default performs no transformation and returns nullptr.
	virtual LoadInst *
	lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const {
	  LoadInst *const NoReplacement = nullptr;
	  return NoReplacement;
	}

	/// Returns how the platform's atomic operations are extended (ZERO_EXTEND,
	/// SIGN_EXTEND, or ANY_EXTEND). The default is ISD::ZERO_EXTEND.
	virtual ISD::NodeType getExtendForAtomicOps() const { return ISD::ZERO_EXTEND; }

	/// @}

	/// Returns true if we should normalize
	/// select(N0&N1, X, Y) => select(N0, select(N1, X, Y), Y) and
	/// select(N0|N1, X, Y) => select(N0, X, select(N1, X, Y)) if it is likely
	/// that it saves us from materializing N0 and N1 in an integer register.
	/// Targets that are able to perform and/or on flags should return false here.
	virtual bool shouldNormalizeToSelectSequence(LLVMContext &Context,
	                                             EVT VT) const {
	  // If a target has multiple condition registers, then it likely has logical
	  // operations on those registers.
	  if (hasMultipleConditionRegisters())
	    return false;
	  // Only do the transform if the value won't be split into multiple
	  // registers.
	  const LegalizeTypeAction Action = getTypeAction(Context, VT);
	  const bool WillBeSplit = Action == TypeExpandInteger ||
	                           Action == TypeExpandFloat ||
	                           Action == TypeSplitVector;
	  return !WillBeSplit;
	}

	/// Return true if a select of constants (select Cond, C1, C2) should be
	/// transformed into simple math ops with the condition value. For example:
	/// select Cond, C1, C1-1 --> add (zext Cond), C1-1
	///
	/// The default answers false for every type.
	virtual bool convertSelectOfConstantsToMath(EVT VT) const { return false; }

	//===--------------------------------------------------------------------===//
	// TargetLowering Configuration Methods - These methods should be invoked by
	// the derived class constructor to configure this object for the target.
	//
	protected:
	/// Specify how the target extends the result of integer and floating point
	/// boolean values from i1 to a wider type. See getBooleanContents.
	///
	/// The same setting Ty is applied to both the integer and the floating point
	/// case.
	void setBooleanContents(BooleanContent Ty) {
	  setBooleanContents(Ty, Ty);
	}

	/// Specify how the target extends the result of integer and floating point
	/// boolean values from i1 to a wider type. See getBooleanContents.
	///
	/// IntTy is stored in BooleanContents and FloatTy in BooleanFloatContents.
	void setBooleanContents(BooleanContent IntTy, BooleanContent FloatTy) {
	  BooleanFloatContents = FloatTy;
	  BooleanContents = IntTy;
	}

	/// Specify how the target extends the result of a vector boolean value from a
	/// vector of i1 to a wider type. See getBooleanContents.
	void setBooleanVectorContents(BooleanContent Ty) { BooleanVectorContents = Ty; }

	/// Specify the target scheduling preference; it is stored in
	/// SchedPreferenceInfo.
	void setSchedulingPreference(Sched::Preference Pref) { SchedPreferenceInfo = Pref; }

	/// Indicate whether this target prefers to use _setjmp to implement
	/// llvm.setjmp or the version without _. Defaults to false.
	void setUseUnderscoreSetJmp(bool Val) { UseUnderscoreSetJmp = Val; }

	/// Indicate whether this target prefers to use _longjmp to implement
	/// llvm.longjmp or the version without _. Defaults to false.
	void setUseUnderscoreLongJmp(bool Val) { UseUnderscoreLongJmp = Val; }

	/// Indicate the minimum number of blocks to generate jump tables.
	///
	/// Only declared here.
	void setMinimumJumpTableEntries(unsigned Val);

	/// Indicate the maximum number of entries in jump tables.
	/// Set to zero to generate unlimited jump tables.
	///
	/// Only declared here.
	void setMaximumJumpTableSize(unsigned);

	/// If set to a physical register, this specifies the register that
	/// llvm.savestack/llvm.restorestack should save and restore.
	void setStackPointerRegisterToSaveRestore(unsigned R) {
	  // Read back through getStackPointerRegisterToSaveRestore().
	  StackPointerRegisterToSaveRestore = R;
	}

	/// Tells the code generator that the target has multiple (allocatable)
	/// condition registers that can be used to store the results of comparisons
	/// for use by selects and conditional branches. With multiple condition
	/// registers, the code generator will not aggressively sink comparisons into
	/// the blocks of their users.
	///
	/// Calling this without an argument sets the flag to true.
	void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
	  // Stored in the HasMultipleConditionRegisters member.
	  HasMultipleConditionRegisters = hasManyRegs;
	}

	/// Tells the code generator that the target has BitExtract instructions.
	/// The code generator will aggressively sink "shift"s into the blocks of
	/// their users if the users will generate "and" instructions which can be
	/// combined with "shift" to BitExtract instructions.
	///
	/// Calling this without an argument sets the flag to true.
	void setHasExtractBitsInsn(bool hasExtractInsn = true) {
	  // Stored in the HasExtractBitsInsn member.
	  HasExtractBitsInsn = hasExtractInsn;
	}

	/// Tells the code generator not to expand logic operations on comparison
	/// predicates into separate sequences that increase the amount of flow
	/// control.
	///
	/// Only declared here; isExpensive defaults to true.
	void setJumpIsExpensive(bool isExpensive = true);

	/// Tells the code generator that this target supports floating point
	/// exceptions and cares about preserving floating point exception behavior.
	///
	/// Calling this without an argument sets the flag to true.
	void setHasFloatingPointExceptions(bool FPExceptions = true) {
	  // Stored in the HasFloatingPointExceptions member.
	  HasFloatingPointExceptions = FPExceptions;
	}

	/// Tells the code generator which bitwidths to bypass.
	///
	/// Records FastBitWidth in BypassSlowDivWidths under the key SlowBitWidth,
	/// replacing any value already stored for that key.
	void addBypassSlowDiv(unsigned int SlowBitWidth,
	                      unsigned int FastBitWidth) {
	  BypassSlowDivWidths[SlowBitWidth] = FastBitWidth;
	}

	/// Add the specified register class as an available regclass for the
	/// specified value type. This indicates the selector can handle values of
	/// that class natively.
	void addRegisterClass(MVT VT, const TargetRegisterClass *RC) {
	  // The simple type doubles as the table index, so it must be in range.
	  assert((unsigned)VT.SimpleTy < array_lengthof(RegClassForVT));
	  RegClassForVT[VT.SimpleTy] = RC;
	}

	/// Return the largest legal super-reg register class of the register class
	/// for the specified type and its associated "cost".
	///
	/// Only declared here; being virtual, a target may provide its own version.
	/// The class and the cost come back together as a pair.
	virtual std::pair<const TargetRegisterClass *, uint8_t>
	findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const;

	/// Once all of the register classes are added, this allows us to compute
	/// derived properties we expose.
	///
	/// Only declared here.
	void computeRegisterProperties(const TargetRegisterInfo *TRI);

	/// Indicate that the specified operation does not work with the specified
	/// type and indicate what to do about it. Note that VT may refer to either
	/// the type of a result or that of an operand of Op.
	void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action) {
	  assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!");
	  // OpActions is indexed by value type first, then by opcode.
	  auto &ActionsForVT = OpActions[(unsigned)VT.SimpleTy];
	  ActionsForVT[Op] = Action;
	}

	/// Indicate that the specified load with extension does not work with the
	/// specified type and indicate what to do about it.
	///
	/// Each table entry holds one 4-bit action per extension type, placed at bit
	/// offset 4 * ExtType.
	void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT,
	                      LegalizeAction Action) {
	  assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() &&
	         MemVT.isValid() && "Table isn't big enough!");
	  assert((unsigned)Action < 0x10 && "too many bits for bitfield array");
	  const unsigned Shift = 4 * ExtType;
	  auto &Entry = LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy];
	  // Clear the old nibble before installing the new action.
	  Entry &= ~((uint16_t)0xF << Shift);
	  Entry |= (uint16_t)Action << Shift;
	}

	/// Indicate that the specified truncating store does not work with the
	/// specified type and indicate what to do about it.
	void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action) {
	  assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!");
	  // TruncStoreActions is indexed by value type first, then by memory type.
	  auto &ActionsForValVT = TruncStoreActions[(unsigned)ValVT.SimpleTy];
	  ActionsForValVT[MemVT.SimpleTy] = Action;
	}

	/// Indicate that the specified indexed load does or does not work with the
	/// specified type and indicate what to do about it.
	///
	/// NOTE: All indexed mode loads are initialized to Expand in
	/// TargetLowering.cpp
	void setIndexedLoadAction(unsigned IdxMode, MVT VT,
	                          LegalizeAction Action) {
	  assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
	         (unsigned)Action < 0xf && "Table isn't big enough!");
	  // Load actions are kept in the upper half of each entry; the lower half
	  // is left untouched.
	  auto &Entry = IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode];
	  Entry &= ~0xf0;
	  Entry |= ((uint8_t)Action) << 4;
	}

	/// Indicate that the specified indexed store does or does not work with the
	/// specified type and indicate what to do about it.
	///
	/// NOTE: All indexed mode stores are initialized to Expand in
	/// TargetLowering.cpp
	void setIndexedStoreAction(unsigned IdxMode, MVT VT,
	                           LegalizeAction Action) {
	  assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
	         (unsigned)Action < 0xf && "Table isn't big enough!");
	  // Store actions are kept in the lower half of each entry; the upper half
	  // is left untouched.
	  auto &Entry = IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode];
	  Entry &= ~0x0f;
	  Entry |= ((uint8_t)Action);
	}

	/// Indicate that the specified condition code is or isn't supported on the
	/// target and indicate what to do about it.
	void setCondCodeAction(ISD::CondCode CC, MVT VT,
	                       LegalizeAction Action) {
	  assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) &&
	         "Table isn't big enough!");
	  assert((unsigned)Action < 0x10 && "too many bits for bitfield array");
	  // The lower 3 bits of the SimpleTy index into Nth 4bit set from the 32-bit
	  // value and the upper 29 bits index into the second dimension of the array
	  // to select what 32-bit value to use.
	  const uint32_t Shift = 4 * (VT.SimpleTy & 0x7);
	  auto &Packed = CondCodeActions[CC][VT.SimpleTy >> 3];
	  // Clear the old nibble before installing the new action.
	  Packed &= ~((uint32_t)0xF << Shift);
	  Packed |= (uint32_t)Action << Shift;
	}

	/// If Opc/OrigVT is specified as being promoted, the promotion code defaults
	/// to trying a larger integer/fp until it can find one that works. If that
	/// default is insufficient, this method can be used by the target to override
	/// the default.
	void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
	  // PromoteToType is keyed by the (opcode, original simple type) pair.
	  const auto Key = std::make_pair(Opc, OrigVT.SimpleTy);
	  PromoteToType[Key] = DestVT.SimpleTy;
	}

	/// Convenience method to set an operation to Promote and specify the type
	/// in a single call.
	void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
	  // First mark the operation as Promote for OrigVT ...
	  setOperationAction(Opc, OrigVT, Promote);
	  // ... then record the type it is promoted to.
	  AddPromotedToType(Opc, OrigVT, DestVT);
	}

	/// Targets should invoke this method for each target independent node that
	/// they want to provide a custom DAG combiner for by implementing the
	/// PerformDAGCombine virtual method.
	void setTargetDAGCombine(ISD::NodeType NT) {
	  assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
	  // One bit per node type: byte index is NT / 8, bit index is NT % 8.
	  TargetDAGCombineArray[NT >> 3] |= 1 << (NT&7);
	}

	/// Set the target's required jmp_buf buffer size (in bytes); default is 200
	void setJumpBufSize(unsigned Size) { JumpBufSize = Size; }

	/// Set the target's required jmp_buf buffer alignment (in bytes); default is
	/// 0
	void setJumpBufAlignment(unsigned Align) { JumpBufAlignment = Align; }

	/// Set the target's minimum function alignment (in log2(bytes))
	void setMinFunctionAlignment(unsigned Align) { MinFunctionAlignment = Align; }

	/// Set the target's preferred function alignment. This should be set if
	/// there is a performance benefit to higher-than-minimum alignment (in
	/// log2(bytes))
	void setPrefFunctionAlignment(unsigned Align) { PrefFunctionAlignment = Align; }

	/// Set the target's preferred loop alignment. Default alignment is zero, it
	/// means the target does not care about loop alignment. The alignment is
	/// specified in log2(bytes). The target may also override
	/// getPrefLoopAlignment to provide per-loop values.
	void setPrefLoopAlignment(unsigned Align) { PrefLoopAlignment = Align; }

	/// Set the minimum stack alignment of an argument (in log2(bytes)).
	void setMinStackArgumentAlignment(unsigned Align) {
	  // Stored as given; the caller supplies the value in log2(bytes).
	  MinStackArgumentAlignment = Align;
	}

	/// Set the maximum atomic operation size supported by the
	/// backend. Atomic operations greater than this size (as well as
	/// ones that are not naturally aligned), will be expanded by
	/// AtomicExpandPass into an __atomic_* library call.
	void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits) {
	  // Plain store of the limit, expressed in bits.
	  MaxAtomicSizeInBitsSupported = SizeInBits;
	}

	/// Sets the minimum cmpxchg or ll/sc size supported by the backend.
	void setMinCmpXchgSizeInBits(unsigned SizeInBits) {
	  // Plain store of the minimum width, expressed in bits.
	  MinCmpXchgSizeInBits = SizeInBits;
	}

	/// Sets whether unaligned atomic operations are supported.
	void setSupportsUnalignedAtomics(bool UnalignedSupported) {
	  // Record the flag verbatim.
	  SupportsUnalignedAtomics = UnalignedSupported;
	}

	public:
	//===--------------------------------------------------------------------===//
	// Addressing mode description hooks (used by LSR etc).
	//

	/// CodeGenPrepare sinks address calculations into the same BB as Load/Store
	/// instructions reading the address. This allows as much computation as
	/// possible to be done in the address mode for that operand. This hook lets
	/// targets also pass back when this should be done on intrinsics which
	/// load/store.
	virtual bool getAddrModeArguments(IntrinsicInst * /I/,
	SmallVectorImpl<Value> &/Ops*/,
	Type &/AccessTy*/) const {
	return false;
	}

	/// This represents an addressing mode of:
	/// BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
	/// If BaseGV is null, there is no BaseGV.
	/// If BaseOffs is zero, there is no base offset.
	/// If HasBaseReg is false, there is no base register.
	/// If Scale is zero, there is no ScaleReg. Scale of 1 indicates a reg with
	/// no scale.
	struct AddrMode {
	// Global value component of the address; null means there is none.
	GlobalValue *BaseGV = nullptr;
	// Constant byte offset component; zero means there is none.
	int64_t BaseOffs = 0;
	// Whether a base register takes part in the address.
	bool HasBaseReg = false;
	// Multiplier applied to the scaled register; zero means no scaled
	// register, one means a register with no scaling.
	int64_t Scale = 0;
	// Default construction yields the "empty" mode described by the
	// in-class initializers above.
	AddrMode() = default;
	};

	/// Return true if the addressing mode represented by AM is legal for this
	/// target, for a load/store of the specified type.
	///
	/// The type may be VoidTy, in which case only return true if the addressing
	/// mode is legal for a load/store of any legal type. TODO: Handle
	/// pre/postinc as well.
	///
	/// If the address space cannot be determined, it will be -1.
	///
	/// TODO: Remove default argument
	virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
	Type *Ty, unsigned AddrSpace,
	Instruction *I = nullptr) const;

	/// \brief Return the cost of the scaling factor used in the addressing mode
	/// represented by AM for this target, for a load/store of the specified type.
	///
	/// If the AM is supported, the return value must be >= 0.
	/// If the AM is not supported, it returns a negative value.
	/// TODO: Handle pre/postinc as well.
	/// TODO: Remove default argument
	virtual int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM,
	                                 Type *Ty, unsigned AS = 0) const {
	  // Default: a scaling factor used in a legal AM is assumed to be free (0);
	  // an illegal AM is reported with a negative cost.
	  const bool IsLegal = isLegalAddressingMode(DL, AM, Ty, AS);
	  return IsLegal ? 0 : -1;
	}

	/// Return true if the specified immediate is legal icmp immediate, that is
	/// the target has icmp instructions which can compare a register against the
	/// immediate without having to materialize the immediate into a register.
	virtual bool isLegalICmpImmediate(int64_t) const { return true; }

	/// Return true if the specified immediate is legal add immediate, that is the
	/// target has add instructions which can add a register with the immediate
	/// without having to materialize the immediate into a register.
	virtual bool isLegalAddImmediate(int64_t) const { return true; }

	/// Return true if it's significantly cheaper to shift a vector by a uniform
	/// scalar than by an amount which will vary across each lane. On x86, for
	/// example, there is a "psllw" instruction for the former case, but no simple
	/// instruction for a general "a << b" operation on vectors.
	virtual bool isVectorShiftByScalarCheap(Type *Ty) const { return false; }

	/// Returns true if the opcode is a commutative binary operation.
	virtual bool isCommutativeBinOp(unsigned Opcode) const {
	  // FIXME: This should get its info from the td file.
	  switch (Opcode) {
	  // Integer add / multiply family.
	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::MULHU:
	  case ISD::MULHS:
	  case ISD::SMUL_LOHI:
	  case ISD::UMUL_LOHI:
	  // Adds with overflow / carry.
	  case ISD::SADDO:
	  case ISD::UADDO:
	  case ISD::ADDC:
	  case ISD::ADDE:
	  // Integer min / max.
	  case ISD::SMIN:
	  case ISD::SMAX:
	  case ISD::UMIN:
	  case ISD::UMAX:
	  // Bitwise logic.
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	  // Floating point.
	  case ISD::FADD:
	  case ISD::FMUL:
	  case ISD::FMINNUM:
	  case ISD::FMAXNUM:
	  case ISD::FMINNAN:
	  case ISD::FMAXNAN:
	    return true;
	  default:
	    break;
	  }
	  return false;
	}

	/// Return true if it's free to truncate a value of type FromTy to type
	/// ToTy. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
	/// by referencing its sub-register AX.
	/// Targets must return false when FromTy <= ToTy.
	virtual bool isTruncateFree(Type FromTy, Type ToTy) const {
	return false;
	}

	/// Return true if a truncation from FromTy to ToTy is permitted when deciding
	/// whether a call is in tail position. Typically this means that both results
	/// would be assigned to the same register or stack slot, but it could mean
	/// the target performs adequate checks of its own before proceeding with the
	/// tail call. Targets must return false when FromTy <= ToTy.
	virtual bool allowTruncateForTailCall(Type FromTy, Type ToTy) const {
	return false;
	}

	virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { return false; }

	virtual bool isProfitableToHoist(Instruction *I) const {
	  // Default: hoisting is always reported as profitable.
	  return true;
	}

	/// Return true if the extension represented by \p I is free.
	/// Unlike the is[Z|FP]ExtFree family, which is based on types,
	/// this method can use the context provided by \p I to decide
	/// whether or not \p I is free.
	/// This method extends the behavior of the is[Z|FP]ExtFree family.
	/// In other words, if is[Z|FP]ExtFree returns true, then this method
	/// returns true as well. The converse is not true.
	/// The target can perform the adequate checks by overriding isExtFreeImpl.
	/// \pre \p I must be a sign, zero, or fp extension.
	bool isExtFree(const Instruction *I) const {
	  // First ask the type-based hooks; sign extensions have no such hook here.
	  bool FreeByType = false;
	  switch (I->getOpcode()) {
	  case Instruction::FPExt:
	    FreeByType = isFPExtFree(EVT::getEVT(I->getType()),
	                             EVT::getEVT(I->getOperand(0)->getType()));
	    break;
	  case Instruction::ZExt:
	    FreeByType = isZExtFree(I->getOperand(0)->getType(), I->getType());
	    break;
	  case Instruction::SExt:
	    break;
	  default:
	    llvm_unreachable("Instruction is not an extension");
	  }
	  // Only fall back to the instruction-based hook when the types alone
	  // did not already answer "free".
	  return FreeByType || isExtFreeImpl(I);
	}

	/// Return true if \p Load and \p Ext can form an ExtLoad.
	/// For example, in AArch64
	/// %L = load i8, i8* %ptr
	/// %E = zext i8 %L to i32
	/// can be lowered into one load instruction
	/// ldrb w0, [x0]
	bool isExtLoad(const LoadInst Load, const Instruction Ext,
	const DataLayout &DL) const {
	EVT VT = getValueType(DL, Ext->getType());
	EVT LoadVT = getValueType(DL, Load->getType());

	// If the load has other users and the truncate is not free, the ext
	// probably isn't free.
	if (!Load->hasOneUse() && (isTypeLegal(LoadVT) \|\| !isTypeLegal(VT)) &&
	!isTruncateFree(Ext->getType(), Load->getType()))
	return false;

	// Check whether the target supports casts folded into loads.
	unsigned LType;
	if (isa<ZExtInst>(Ext))
	LType = ISD::ZEXTLOAD;
	else {
	assert(isa<SExtInst>(Ext) && "Unexpected ext type!");
	LType = ISD::SEXTLOAD;
	}

	return isLoadExtLegal(LType, VT, LoadVT);
	}

	/// Return true if any actual instruction that defines a value of type FromTy
	/// implicitly zero-extends the value to ToTy in the result register.
	///
	/// The function should return true when it is likely that the truncate can
	/// be freely folded with an instruction defining a value of FromTy. If
	/// the defining instruction is unknown (because you're looking at a
	/// function argument, PHI, etc.) then the target may require an
	/// explicit truncate, which is not necessarily free, but this function
	/// does not deal with those cases.
	/// Targets must return false when FromTy >= ToTy.
	virtual bool isZExtFree(Type FromTy, Type ToTy) const {
	return false;
	}

	virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { return false; }

	/// Return true if the target supplies and combines to a paired load
	/// two loaded values of type LoadedType next to each other in memory.
	/// RequiredAlignment gives the minimal alignment constraints that must be met
	/// to be able to select this paired load.
	///
	/// This information is not used to generate actual paired loads, but it is
	/// used to generate a sequence of loads that is easier to combine into a
	/// paired load.
	/// For instance, something like this:
	/// a = load i64* addr
	/// b = trunc i64 a to i32
	/// c = lshr i64 a, 32
	/// d = trunc i64 c to i32
	/// will be optimized into:
	/// b = load i32* addr1
	/// d = load i32* addr2
	/// Where addr1 = addr2 +/- sizeof(i32).
	///
	/// In other words, unless the target performs a post-isel load combining,
	/// this information should not be provided because it will generate more
	/// loads.
	virtual bool hasPairedLoad(EVT /*LoadedType*/,
	                           unsigned & /*RequiredAlignment*/) const {
	  // Default: no paired load is reported; RequiredAlignment is left untouched.
	  return false;
	}

	/// \brief Get the maximum supported factor for interleaved memory accesses.
	/// Default to be the minimum interleave factor: 2.
	virtual unsigned getMaxSupportedInterleaveFactor() const {
	  // Default is the minimum interleave factor.
	  return 2;
	}

	/// \brief Lower an interleaved load to target specific intrinsics. Return
	/// true on success.
	///
	/// \p LI is the vector load instruction.
	/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
	/// \p Indices is the corresponding indices for each shufflevector.
	/// \p Factor is the interleave factor.
	virtual bool lowerInterleavedLoad(LoadInst *LI,
	                                  ArrayRef<ShuffleVectorInst *> Shuffles,
	                                  ArrayRef<unsigned> Indices,
	                                  unsigned Factor) const {
	  // Default: nothing is lowered, so report failure.
	  return false;
	}

	/// \brief Lower an interleaved store to target specific intrinsics. Return
	/// true on success.
	///
	/// \p SI is the vector store instruction.
	/// \p SVI is the shufflevector to RE-interleave the stored vector.
	/// \p Factor is the interleave factor.
	virtual bool lowerInterleavedStore(StoreInst SI, ShuffleVectorInst SVI,
	unsigned Factor) const {
	return false;
	}

	/// Return true if zero-extending the specific node Val to type VT2 is free
	/// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
	/// because it's folded such as X86 zero-extending loads).
	virtual bool isZExtFree(SDValue Val, EVT VT2) const {
	  // Default: defer to the type-based query using the node's value type.
	  const EVT SrcVT = Val.getValueType();
	  return isZExtFree(SrcVT, VT2);
	}

	/// Return true if an fpext operation is free (for instance, because
	/// single-precision floating-point numbers are implicitly extended to
	/// double-precision).
	virtual bool isFPExtFree(EVT DestVT, EVT SrcVT) const {
	  // Both sides of an fpext must be floating-point types.
	  assert(SrcVT.isFloatingPoint() && DestVT.isFloatingPoint() &&
	         "invalid fpext types");
	  // Default: fpext is not reported as free.
	  return false;
	}

	/// Return true if an fpext operation input to an \p Opcode operation is free
	/// (for instance, because half-precision floating-point numbers are
	/// implicitly extended to float-precision) for an FMA instruction.
	virtual bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const {
	  // Both sides of an fpext must be floating-point types.
	  assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
	         "invalid fpext types");
	  // Default: Opcode is ignored and the answer is that of isFPExtFree.
	  const bool Free = isFPExtFree(DestVT, SrcVT);
	  return Free;
	}

	/// Return true if folding a vector load into ExtVal (a sign, zero, or any
	/// extend node) is profitable.
	virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const {
	  // Default: folding is not reported as profitable.
	  return false;
	}

	/// Return true if an fneg operation is free to the point where it is never
	/// worthwhile to replace it with a bitwise operation.
	virtual bool isFNegFree(EVT VT) const {
	  // Only meaningful for floating-point types.
	  assert(VT.isFloatingPoint());
	  // Default: fneg is not reported as free.
	  return false;
	}

	/// Return true if an fabs operation is free to the point where it is never
	/// worthwhile to replace it with a bitwise operation.
	virtual bool isFAbsFree(EVT VT) const {
	  // Only meaningful for floating-point types.
	  assert(VT.isFloatingPoint());
	  // Default: fabs is not reported as free.
	  return false;
	}

	/// Return true if an FMA operation is faster than a pair of fmul and fadd
	/// instructions. fmuladd intrinsics will be expanded to FMAs when this method
	/// returns true, otherwise fmuladd is expanded to fmul + fadd.
	///
	/// NOTE: This may be called before legalization on types for which FMAs are
	/// not legal, but should return true if those types will eventually legalize
	/// to types that support FMAs. After legalization, it will only be called on
	/// types that support FMAs (via Legal or Custom actions)
	virtual bool isFMAFasterThanFMulAndFAdd(EVT) const { return false; }

	/// Return true if it's profitable to narrow operations of type VT1 to
	/// VT2. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
	/// i32 to i16.
	virtual bool isNarrowingProfitable(EVT /*VT1*/, EVT /*VT2*/) const {
	  // Default: narrowing is not reported as profitable.
	  return false;
	}

	/// \brief Return true if it is beneficial to convert a load of a constant to
	/// just the constant itself.
	/// On some targets it might be more efficient to use a combination of
	/// arithmetic instructions to materialize the constant instead of loading it
	/// from a constant pool.
	virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                               Type *Ty) const {
	  // Default: keep the constant load as it is.
	  return false;
	}

	/// Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type
	/// from this source type with this index. This is needed because
	/// EXTRACT_SUBVECTOR usually has custom lowering that depends on the index of
	/// the first element, and only the target knows which lowering is cheap.
	virtual bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                     unsigned Index) const {
	  // Default: no extraction is reported as cheap.
	  return false;
	}

	// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
	// even if the vector itself has multiple uses.
	virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const {
	  // Default: do not prefer the scalar inputs.
	  return false;
	}

	//===--------------------------------------------------------------------===//
	// Runtime Library hooks
	//

	/// Rename the default libcall routine name for the specified libcall.
	void setLibcallName(RTLIB::Libcall Call, const char *Name) {
	  // Only the pointer is stored; the string itself is not copied.
	  const char *&Slot = LibcallRoutineNames[Call];
	  Slot = Name;
	}

	/// Get the libcall routine name for the specified libcall.
	const char *getLibcallName(RTLIB::Libcall Call) const {
	  // Direct table lookup indexed by the libcall enumerator.
	  const char *Name = LibcallRoutineNames[Call];
	  return Name;
	}

	/// Override the default CondCode to be used to test the result of the
	/// comparison libcall against zero.
	void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
	  ISD::CondCode &Slot = CmpLibcallCCs[Call];
	  Slot = CC;
	}

	/// Get the CondCode that's to be used to test the result of the comparison
	/// libcall against zero.
	ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
	  // Direct table lookup indexed by the libcall enumerator.
	  const ISD::CondCode CC = CmpLibcallCCs[Call];
	  return CC;
	}

	/// Set the CallingConv that should be used for the specified libcall.
	void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
	  CallingConv::ID &Slot = LibcallCallingConvs[Call];
	  Slot = CC;
	}

	/// Get the CallingConv that should be used for the specified libcall.
	CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
	  // Direct table lookup indexed by the libcall enumerator.
	  const CallingConv::ID CC = LibcallCallingConvs[Call];
	  return CC;
	}

	/// Execute target specific actions to finalize target lowering.
	/// This is used to set extra flags in MachineFrameInformation and freezing
	/// the set of reserved registers.
	/// The default implementation just freezes the set of reserved registers.
	virtual void finalizeLowering(MachineFunction &MF) const;

	private:
	const TargetMachine &TM;

	/// Tells the code generator that the target has multiple (allocatable)
	/// condition registers that can be used to store the results of comparisons
	/// for use by selects and conditional branches. With multiple condition
	/// registers, the code generator will not aggressively sink comparisons into
	/// the blocks of their users.
	bool HasMultipleConditionRegisters;

	/// Tells the code generator that the target has BitExtract instructions.
	/// The code generator will aggressively sink "shift"s into the blocks of
	/// their users if the users will generate "and" instructions which can be
	/// combined with "shift" to BitExtract instructions.
	bool HasExtractBitsInsn;

	/// Tells the code generator to bypass slow divide or remainder
	/// instructions. For example, BypassSlowDivWidths[32,8] tells the code
	/// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
	/// div/rem when the operands are positive and less than 256.
	DenseMap <unsigned int, unsigned int> BypassSlowDivWidths;

	/// Tells the code generator that it shouldn't generate extra flow control
	/// instructions and should attempt to combine flow control instructions via
	/// predication.
	bool JumpIsExpensive;

	/// Whether the target supports or cares about preserving floating point
	/// exception behavior.
	bool HasFloatingPointExceptions;

	/// This target prefers to use _setjmp to implement llvm.setjmp.
	///
	/// Defaults to false.
	bool UseUnderscoreSetJmp;

	/// This target prefers to use _longjmp to implement llvm.longjmp.
	///
	/// Defaults to false.
	bool UseUnderscoreLongJmp;

	/// Information about the contents of the high-bits in boolean values held in
	/// a type wider than i1. See getBooleanContents.
	BooleanContent BooleanContents;

	/// Information about the contents of the high-bits in boolean values held in
	/// a type wider than i1. See getBooleanContents.
	BooleanContent BooleanFloatContents;

	/// Information about the contents of the high-bits in boolean vector values
	/// when the element type is wider than i1. See getBooleanContents.
	BooleanContent BooleanVectorContents;

	/// The target scheduling preference: shortest possible total cycles or lowest
	/// register usage.
	Sched::Preference SchedPreferenceInfo;

	/// The size, in bytes, of the target's jmp_buf buffers
	unsigned JumpBufSize;

	/// The alignment, in bytes, of the target's jmp_buf buffers
	unsigned JumpBufAlignment;

	/// The minimum alignment that any argument on the stack needs to have.
	unsigned MinStackArgumentAlignment;

	/// The minimum function alignment (used when optimizing for size, and to
	/// prevent explicitly provided alignment from leading to incorrect code).
	unsigned MinFunctionAlignment;

	/// The preferred function alignment (used when alignment unspecified and
	/// optimizing for speed).
	unsigned PrefFunctionAlignment;

	/// The preferred loop alignment.
	unsigned PrefLoopAlignment;

	/// Size in bits of the maximum atomics size the backend supports.
	/// Accesses larger than this will be expanded by AtomicExpandPass.
	unsigned MaxAtomicSizeInBitsSupported;

	/// Size in bits of the minimum cmpxchg or ll/sc operation the
	/// backend supports.
	unsigned MinCmpXchgSizeInBits;

	/// This indicates if the target supports unaligned atomic operations.
	bool SupportsUnalignedAtomics;

	/// If set to a physical register, this specifies the register that
	/// llvm.savestack/llvm.restorestack should save and restore.
	unsigned StackPointerRegisterToSaveRestore;

	/// This indicates the default register class to use for each ValueType the
	/// target supports natively.
	const TargetRegisterClass *RegClassForVT[MVT::LAST_VALUETYPE];
	unsigned char NumRegistersForVT[MVT::LAST_VALUETYPE];
	MVT RegisterTypeForVT[MVT::LAST_VALUETYPE];

	/// This indicates the "representative" register class to use for each
	/// ValueType the target supports natively. This information is used by the
	/// scheduler to track register pressure. By default, the representative
	/// register class is the largest legal super-reg register class of the
	/// register class of the specified type. e.g. On x86, i8, i16, and i32's
	/// representative class would be GR32.
	const TargetRegisterClass *RepRegClassForVT[MVT::LAST_VALUETYPE];

	/// This indicates the "cost" of the "representative" register class for each
	/// ValueType. The cost is used by the scheduler to approximate register
	/// pressure.
	uint8_t RepRegClassCostForVT[MVT::LAST_VALUETYPE];

	/// For any value types we are promoting or expanding, this contains the value
	/// type that we are changing to. For Expanded types, this contains one step
	/// of the expand (e.g. i64 -> i32), even if there are multiple steps required
	/// (e.g. i64 -> i16). For types natively supported by the system, this holds
	/// the same type (e.g. i32 -> i32).
	MVT TransformToType[MVT::LAST_VALUETYPE];

	/// For each operation and each value type, keep a LegalizeAction that
	/// indicates how instruction selection should deal with the operation. Most
	/// operations are Legal (aka, supported natively by the target), but
	/// operations that are not should be described. Note that operations on
	/// non-legal value types are not described here.
	LegalizeAction OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END];

	/// For each load extension type and each value type, keep a LegalizeAction
	/// that indicates how instruction selection should deal with a load of a
	/// specific value type and extension type. Uses 4-bits to store the action
	/// for each of the 4 load ext types.
	uint16_t LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];

	/// For each value type pair keep a LegalizeAction that indicates whether a
	/// truncating store of a specific value type and truncating type is legal.
	LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];

	/// For each indexed mode and each value type, keep a pair of LegalizeAction
	/// that indicates how instruction selection should deal with the load /
	/// store.
	///
	/// The first dimension is the value_type for the reference. The second
	/// dimension represents the various modes for load store.
	uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE];

	/// For each condition code (ISD::CondCode) keep a LegalizeAction that
	/// indicates how instruction selection should deal with the condition code.
	///
	/// Because each CC action takes up 4 bits, we need to have the array size be
	/// large enough to fit all of the value types. This can be done by rounding
	/// up the MVT::LAST_VALUETYPE value to the next multiple of 8.
	uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8];

	protected:
	ValueTypeActionImpl ValueTypeActions;

	private:
	LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const;

	/// Targets can specify ISD nodes that they would like PerformDAGCombine
	/// callbacks for by calling setTargetDAGCombine(), which sets a bit in this
	/// array.
	unsigned char
	TargetDAGCombineArray[(ISD::BUILTIN_OP_END+CHAR_BIT-1)/CHAR_BIT];

	/// For operations that must be promoted to a specific type, this holds the
	/// destination type. This map should be sparse, so don't hold it as an
	/// array.
	///
	/// Targets add entries to this map with AddPromotedToType(..), clients access
	/// this with getTypeToPromoteTo(..).
	std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType>
	PromoteToType;

	/// Stores the name of each libcall.
	const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];

	/// The ISD::CondCode that should be used to test the result of each of the
	/// comparison libcall against zero.
	ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];

	/// Stores the CallingConv that should be used for each libcall.
	CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];

	/// Set default libcall names and calling conventions.
	void InitLibcalls(const Triple &TT);

	protected:
	/// Return true if the extension represented by \p I is free.
	/// \pre \p I is a sign, zero, or fp extension and
	/// is[Z|FP]ExtFree of the related types is not true.
	virtual bool isExtFreeImpl(const Instruction *I) const {
	  // Default: the instruction-based check reports "not free".
	  return false;
	}

	/// Depth that GatherAllAliases should continue looking for chain
	/// dependencies when trying to find a more preferable chain. As an
	/// approximation, this should be more than the number of consecutive stores
	/// expected to be merged.
	unsigned GatherAllAliasesMaxDepth;

	/// \brief Specify maximum number of store instructions per memset call.
	///
	/// When lowering \@llvm.memset this field specifies the maximum number of
	/// store operations that may be substituted for the call to memset. Targets
	/// must set this value based on the cost threshold for that target. Targets
	/// should assume that the memset will be done using as many of the largest
	/// store operations first, followed by smaller ones, if necessary, per
	/// alignment restrictions. For example, storing 9 bytes on a 32-bit machine
	/// with 16-bit alignment would result in four 2-byte stores and one 1-byte
	/// store. This only applies to setting a constant array of a constant size.
	unsigned MaxStoresPerMemset;

	/// Maximum number of store operations that may be substituted for the call
	/// to memset, used for functions with OptSize attribute.
	unsigned MaxStoresPerMemsetOptSize;

	/// \brief Specify maximum number of store instructions per memcpy call.
	///
	/// When lowering \@llvm.memcpy this field specifies the maximum number of
	/// store operations that may be substituted for a call to memcpy. Targets
	/// must set this value based on the cost threshold for that target. Targets
	/// should assume that the memcpy will be done using as many of the largest
	/// store operations first, followed by smaller ones, if necessary, per
	/// alignment restrictions. For example, storing 7 bytes on a 32-bit machine
	/// with 32-bit alignment would result in one 4-byte store, a one 2-byte store
	/// and one 1-byte store. This only applies to copying a constant array of
	/// constant size.
	unsigned MaxStoresPerMemcpy;

	/// Maximum number of store operations that may be substituted for a call to
	/// memcpy, used for functions with OptSize attribute.
	unsigned MaxStoresPerMemcpyOptSize;
	unsigned MaxLoadsPerMemcmp;
	unsigned MaxLoadsPerMemcmpOptSize;

	/// \brief Specify maximum number of store instructions per memmove call.
	///
	/// When lowering \@llvm.memmove this field specifies the maximum number of
	/// store instructions that may be substituted for a call to memmove. Targets
	/// must set this value based on the cost threshold for that target. Targets
	/// should assume that the memmove will be done using as many of the largest
	/// store operations first, followed by smaller ones, if necessary, per
	/// alignment restrictions. For example, moving 9 bytes on a 32-bit machine
	/// with 8-bit alignment would result in nine 1-byte stores. This only
	/// applies to copying a constant array of constant size.
	unsigned MaxStoresPerMemmove;

	/// Maximum number of store instructions that may be substituted for a call to
	/// memmove, used for functions with OptSize attribute.
	unsigned MaxStoresPerMemmoveOptSize;

	/// Tells the code generator that select is more expensive than a branch if
	/// the branch is usually predicted right.
	bool PredictableSelectIsExpensive;

	/// \see enableExtLdPromotion.
	bool EnableExtLdPromotion;

	/// Return true if the value types that can be represented by the specified
	/// register class are all legal.
	bool isLegalRC(const TargetRegisterInfo &TRI,
	const TargetRegisterClass &RC) const;

	/// Replace/modify any TargetFrameIndex operands with a target-dependent
	/// sequence of memory operands that is recognized by PrologEpilogInserter.
	MachineBasicBlock *emitPatchPoint(MachineInstr &MI,
	MachineBasicBlock *MBB) const;
	};

	/// This class defines information used to lower LLVM code to legal SelectionDAG
	/// operators that the target instruction selector can accept natively.
	///
	/// This class also defines callbacks that targets must implement to lower
	/// target-specific constructs to SelectionDAG operators.
	class TargetLowering : public TargetLoweringBase {
	public:
	struct DAGCombinerInfo;

	TargetLowering(const TargetLowering &) = delete;
	TargetLowering &operator=(const TargetLowering &) = delete;

	/// NOTE: The TargetMachine owns TLOF.
	explicit TargetLowering(const TargetMachine &TM);

	bool isPositionIndependent() const;

	/// Returns true by value, base pointer and offset pointer and addressing mode
	/// by reference if the node's address can be legally represented as
	/// pre-indexed load / store address.
	virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue &/*Base*/,
	                                       SDValue &/*Offset*/,
	                                       ISD::MemIndexedMode &/*AM*/,
	                                       SelectionDAG &/*DAG*/) const {
	  // Default: no pre-indexed form is reported and the by-reference outputs
	  // are left untouched.
	  return false;
	}

	/// Returns true by value, base pointer and offset pointer and addressing mode
	/// by reference if this node can be combined with a load / store to form a
	/// post-indexed load / store.
	virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/,
	                                        SDValue &/*Base*/,
	                                        SDValue &/*Offset*/,
	                                        ISD::MemIndexedMode &/*AM*/,
	                                        SelectionDAG &/*DAG*/) const {
	  // Default: no post-indexed form is reported and the by-reference outputs
	  // are left untouched.
	  return false;
	}

	/// Return the entry encoding for a jump table in the current function. The
	/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
	virtual unsigned getJumpTableEncoding() const;

	virtual const MCExpr *
	LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/,
	                          const MachineBasicBlock * /*MBB*/, unsigned /*uid*/,
	                          MCContext &/*Ctx*/) const {
	  // There is no default lowering: reaching this is treated as unreachable.
	  llvm_unreachable("Need to implement this hook if target has custom JTIs");
	}

	/// Returns relocation base for the given PIC jumptable.
	virtual SDValue getPICJumpTableRelocBase(SDValue Table,
	SelectionDAG &DAG) const;

	/// This returns the relocation base for the given PIC jumptable, the same as
	/// getPICJumpTableRelocBase, but as an MCExpr.
	virtual const MCExpr *
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
	unsigned JTI, MCContext &Ctx) const;

	/// Return true if folding a constant offset with the given GlobalAddress is
	/// legal. It is frequently not legal in PIC relocation models.
	virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;

	bool isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
	SDValue &Chain) const;

	void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS,
	SDValue &NewRHS, ISD::CondCode &CCCode,
	const SDLoc &DL) const;

	/// Returns a pair of (return value, chain).
	/// It is an error to pass RTLIB::UNKNOWN_LIBCALL as \p LC.
	std::pair<SDValue, SDValue> makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC,
	EVT RetVT, ArrayRef<SDValue> Ops,
	bool isSigned, const SDLoc &dl,
	bool doesNotReturn = false,
	bool isReturnValueUsed = true) const;

	/// Check whether parameters to a call that are passed in callee saved
	/// registers are the same as from the calling function. This needs to be
	/// checked for tail call eligibility.
	bool parametersInCSRMatch(const MachineRegisterInfo &MRI,
	const uint32_t *CallerPreservedMask,
	const SmallVectorImpl<CCValAssign> &ArgLocs,
	const SmallVectorImpl<SDValue> &OutVals) const;

	//===--------------------------------------------------------------------===//
	// TargetLowering Optimization Methods
	//

	/// Convenience bundle handed between TargetLowering and clients that want to
	/// combine: it carries the DAG, the two legality flags, and a pair of
	/// SDValues (Old / New) used to report a replacement.
	struct TargetLoweringOpt {
	  SelectionDAG &DAG;
	  bool LegalTys;
	  bool LegalOps;
	  SDValue Old;
	  SDValue New;

	  /// Binds the DAG and records the two legality flags; Old and New start out
	  /// default-constructed.
	  explicit TargetLoweringOpt(SelectionDAG &InDAG, bool LT, bool LO)
	      : DAG(InDAG), LegalTys(LT), LegalOps(LO) {}

	  /// Accessor for the LegalTys flag.
	  bool LegalTypes() const {
	    return LegalTys;
	  }

	  /// Accessor for the LegalOps flag.
	  bool LegalOperations() const {
	    return LegalOps;
	  }

	  /// Record that \p O should be replaced by \p N. Always returns true.
	  bool CombineTo(SDValue O, SDValue N) {
	    Old = O;
	    New = N;
	    return true;
	  }
	};

	/// Check to see if the specified operand of the specified instruction is a
	/// constant integer. If so, check to see if there are any bits set in the
	/// constant that are not demanded. If so, shrink the constant and return
	/// true.
	bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
	TargetLoweringOpt &TLO) const;

	/// Target hook for target-specific constant optimization; it is called by
	/// ShrinkDemandedConstant. An override should return true if the target
	/// doesn't want ShrinkDemandedConstant to optimize the constant any further.
	/// The default implementation returns false.
	virtual bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
	                                          TargetLoweringOpt &TLO) const {
	  return false;
	}

	/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. This
	/// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
	/// generalized for targets with other types of implicit widening casts.
	bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
	TargetLoweringOpt &TLO) const;

	/// Helper for SimplifyDemandedBits that can simplify an operation with
	/// multiple uses. This function simplifies operand \p OpIdx of \p User and
	/// then updates \p User with the simplified version. No other uses of
	/// \p OpIdx are updated. If \p User is the only user of \p OpIdx, this
	/// function behaves exactly like function SimplifyDemandedBits declared
	/// below except that it also updates the DAG by calling
	/// DCI.CommitTargetLoweringOpt.
	bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx, const APInt &Demanded,
	DAGCombinerInfo &DCI, TargetLoweringOpt &TLO) const;

	/// Look at Op. At this point, we know that only the DemandedMask bits of the
	/// result of Op are ever used downstream. If we can use this information to
	/// simplify Op, create a new simplified DAG node and return true, returning
	/// the original and new nodes in TLO.Old and TLO.New. Otherwise, analyze the
	/// expression and return its known-zero and known-one bits in \p Known
	/// (used to simplify the caller). The known bits may only be accurate for
	/// those bits in the DemandedMask.
	/// \p AssumeSingleUse When this parameter is true, this function will
	/// attempt to simplify \p Op even if there are multiple uses.
	/// Callers are responsible for correctly updating the DAG based on the
	/// results of this function, because simply replacing TLO.Old
	/// with TLO.New will be incorrect when this parameter is true and TLO.Old
	/// has multiple uses.
	bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
	KnownBits &Known,
	TargetLoweringOpt &TLO,
	unsigned Depth = 0,
	bool AssumeSingleUse = false) const;

	/// Helper wrapper around SimplifyDemandedBits
	bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
	DAGCombinerInfo &DCI) const;

	/// Determine which bits of \p Op are known to be either zero or one and
	/// return them in \p Known. The DemandedElts argument allows us to only
	/// collect the known bits that are shared by the requested vector elements.
	virtual void computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const;

	/// Determine which of the bits of FrameIndex \p FIOp are known to be 0.
	/// Default implementation computes low bits based on alignment
	/// information. This should preserve known bits passed into it.
	virtual void computeKnownBitsForFrameIndex(const SDValue FIOp,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const;

	/// This method can be implemented by targets that want to expose additional
	/// information about sign bits to the DAG Combiner. The DemandedElts
	/// argument allows us to only collect the minimum sign bits that are shared
	/// by the requested vector elements.
	virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const;

	/// Context passed to DAG-combine hooks: the SelectionDAG being combined, the
	/// current combine level, whether the legalizer made the call, and an opaque
	/// pointer to the DAG Combiner object.
	struct DAGCombinerInfo {
	  void *DC; // The DAG Combiner object.
	  CombineLevel Level;
	  bool CalledByLegalizer;

	public:
	  SelectionDAG &DAG;

	  DAGCombinerInfo(SelectionDAG &dag, CombineLevel level, bool cl, void *dc)
	      : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {}

	  bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; }
	  bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; }

	  /// NOTE(review): despite the name, this compares against AfterLegalizeDAG,
	  /// not AfterLegalizeVectorOps. Behavior is kept as-is because callers may
	  /// rely on it -- confirm the intended level.
	  bool isAfterLegalizeVectorOps() const {
	    return Level == AfterLegalizeDAG;
	  }

	  /// Const-qualified like the other accessors so that the level can also be
	  /// read through a const DAGCombinerInfo.
	  CombineLevel getDAGCombineLevel() const { return Level; }
	  bool isCalledByLegalizer() const { return CalledByLegalizer; }

	  void AddToWorklist(SDNode *N);
	  SDValue CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo = true);
	  SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true);
	  SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true);

	  void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO);
	};

	/// Return true if \p N is a constant or constant vector equal to the true
	/// value from getBooleanContents().
	bool isConstTrueVal(const SDNode *N) const;

	/// Return true if \p N is a constant or constant vector equal to the false
	/// value from getBooleanContents().
	bool isConstFalseVal(const SDNode *N) const;

	/// Return a constant of type VT that contains a true value that respects
	/// getBooleanContents()
	SDValue getConstTrueVal(SelectionDAG &DAG, EVT VT, const SDLoc &DL) const;

	/// Return true if \p N is a True value when extended to \p VT.
	bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool Signed) const;

	/// Try to simplify a setcc built with the specified operands and cc. If it is
	/// unable to simplify it, return a null SDValue.
	SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
	bool foldBooleans, DAGCombinerInfo &DCI,
	const SDLoc &dl) const;

	/// For targets which wrap addresses, unwrap the address for analysis. The
	/// default implementation hands \p N back unchanged.
	virtual SDValue unwrapAddress(SDValue N) const {
	  return N;
	}

	/// Returns true (and the GlobalValue and the offset) if the node is a
	/// GlobalAddress + offset.
	virtual bool
	isGAPlusOffset(SDNode N, const GlobalValue &GA, int64_t &Offset) const;

	/// This method will be invoked for all target nodes and for any
	/// target-independent nodes that the target has registered to have it
	/// invoked for.
	///
	/// The semantics are as follows:
	/// Return Value:
	/// SDValue.Val == 0 - No change was made
	/// SDValue.Val == N - N was replaced, is dead, and is already handled.
	/// otherwise - N should be replaced by the returned Operand.
	///
	/// In addition, methods provided by DAGCombinerInfo may be used to perform
	/// more complex transformations.
	///
	virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;

	/// Return true if it is profitable to move a following shift through this
	/// node, adjusting any immediate operands as necessary to preserve semantics.
	/// This transformation may not be desirable if it disrupts a particularly
	/// auspicious target-specific tree (e.g. bitfield extraction in AArch64).
	/// By default, it returns true.
	virtual bool isDesirableToCommuteWithShift(const SDNode *N) const {
	  return true;
	}

	/// Return true if it is profitable to combine a BUILD_VECTOR with a
	/// stride-pattern to a shuffle and a truncate.
	/// Example of such a combine:
	///   v4i32 build_vector((extract_elt V, 1),
	///                      (extract_elt V, 3),
	///                      (extract_elt V, 5),
	///                      (extract_elt V, 7))
	///   -->
	///   v4i32 truncate (bitcast (shuffle<1,u,3,u,5,u,7,u> V, u) to v4i64)
	/// The default implementation returns false.
	virtual bool isDesirableToCombineBuildVectorToShuffleTruncate(
	    ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
	  return false;
	}

	/// Return true if the target has native support for the specified value type
	/// and it is 'desirable' to use the type for the given node type. e.g. On x86
	/// i16 is legal, but undesirable since i16 instruction encodings are longer
	/// and some i16 instructions are slow.
	virtual bool isTypeDesirableForOp(unsigned /Opc/, EVT VT) const {
	// By default, assume all legal types are desirable.
	return isTypeLegal(VT);
	}

	/// Return true if it is profitable for dag combiner to transform a floating
	/// point op of specified opcode to a equivalent op of an integer
	/// type. e.g. f32 load -> i32 load can be profitable on ARM.
	virtual bool isDesirableToTransformToIntegerOp(unsigned /Opc/,
	EVT /VT/) const {
	return false;
	}

	/// This method queries the target whether it is beneficial for dag combiner
	/// to promote the specified node. If true, it should return the desired
	/// promotion type by reference.
	/// The default implementation returns false and does not write the type.
	virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT & /*PVT*/) const {
	  return false;
	}

	/// Return true if the target supports swifterror attribute. It optimizes
	/// loads and stores to reading and writing a specific register.
	/// The default implementation returns false.
	virtual bool supportSwiftError() const { return false; }

	/// Return true if the target supports that a subset of CSRs for the given
	/// machine function is handled explicitly via copies.
	/// The default implementation returns false.
	virtual bool supportSplitCSR(MachineFunction *MF) const { return false; }

	/// Perform necessary initialization to handle a subset of CSRs explicitly
	/// via copies. This function is called at the beginning of instruction
	/// selection.
	/// The default implementation is unreachable ("Not Implemented").
	virtual void initializeSplitCSR(MachineBasicBlock *Entry) const {
	  llvm_unreachable("Not Implemented");
	}

	/// Insert explicit copies in entry and exit blocks. We copy a subset of
	/// CSRs to virtual registers in the entry block, and copy them back to
	/// physical registers in the exit blocks. This function is called at the end
	/// of instruction selection.
	/// The default implementation is unreachable ("Not Implemented").
	virtual void
	insertCopiesSplitCSR(MachineBasicBlock *Entry,
	                     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  llvm_unreachable("Not Implemented");
	}

	//===--------------------------------------------------------------------===//
	// Lowering methods - These methods must be implemented by targets so that
	// the SelectionDAGBuilder code knows how to lower these.
	//

	/// This hook must be implemented to lower the incoming (formal) arguments,
	/// described by the Ins array, into the specified DAG. The implementation
	/// should fill in the InVals array with legal-type argument values, and
	/// return the resulting token chain value.
	///
	/// The default implementation is unreachable ("Not Implemented").
	virtual SDValue LowerFormalArguments(
	    SDValue /*Chain*/, CallingConv::ID /*CallConv*/, bool /*isVarArg*/,
	    const SmallVectorImpl<ISD::InputArg> & /*Ins*/, const SDLoc & /*dl*/,
	    SelectionDAG & /*DAG*/, SmallVectorImpl<SDValue> & /*InVals*/) const {
	  llvm_unreachable("Not Implemented");
	}

	/// This structure contains all information that is necessary for lowering
	/// calls. It is passed to TLI::LowerCallTo when the SelectionDAG builder
	/// needs to lower a call, and targets will see this struct in their LowerCall
	/// implementation.
	///
	/// Every setter returns *this, so configuration calls can be chained.
	struct CallLoweringInfo {
	  SDValue Chain;
	  Type *RetTy = nullptr;
	  bool RetSExt : 1;
	  bool RetZExt : 1;
	  bool IsVarArg : 1;
	  bool IsInReg : 1;
	  bool DoesNotReturn : 1;
	  bool IsReturnValueUsed : 1;
	  bool IsConvergent : 1;
	  bool IsPatchPoint : 1;

	  // IsTailCall should be modified by implementations of
	  // TargetLowering::LowerCall that perform tail call conversions.
	  bool IsTailCall = false;

	  // Is Call lowering done post SelectionDAG type legalization.
	  bool IsPostTypeLegalization = false;

	  // -1 wraps around to the largest unsigned value; it stays that way until
	  // one of the setCallee / setLibCallee overloads stores a real count.
	  unsigned NumFixedArgs = -1;
	  CallingConv::ID CallConv = CallingConv::C;
	  SDValue Callee;
	  ArgListTy Args;
	  SelectionDAG &DAG;
	  SDLoc DL;
	  ImmutableCallSite CS;
	  SmallVector<ISD::OutputArg, 32> Outs;
	  SmallVector<SDValue, 32> OutVals;
	  SmallVector<ISD::InputArg, 32> Ins;
	  SmallVector<SDValue, 4> InVals;

	  /// Binds \p DAG and initializes the bit-field flags: all of them start out
	  /// false except IsReturnValueUsed, which starts out true.
	  CallLoweringInfo(SelectionDAG &DAG)
	      : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false),
	        DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false),
	        IsPatchPoint(false), DAG(DAG) {}

	  CallLoweringInfo &setDebugLoc(const SDLoc &dl) {
	    DL = dl;
	    return *this;
	  }

	  CallLoweringInfo &setChain(SDValue InChain) {
	    Chain = InChain;
	    return *this;
	  }

	  // setCallee with target/module-specific attributes
	  /// Same as the (CC, ResultType, Target, ArgsList) setCallee overload, but
	  /// additionally passes the argument list to markLibCallAttributes.
	  CallLoweringInfo &setLibCallee(CallingConv::ID CC, Type *ResultType,
	                                 SDValue Target, ArgListTy &&ArgsList) {
	    RetTy = ResultType;
	    Callee = Target;
	    CallConv = CC;
	    NumFixedArgs = ArgsList.size();
	    Args = std::move(ArgsList);

	    DAG.getTargetLoweringInfo().markLibCallAttributes(
	        &(DAG.getMachineFunction()), CC, Args);
	    return *this;
	  }

	  /// Set return type, callee, calling convention and arguments. Every
	  /// argument in \p ArgsList counts as a fixed argument.
	  CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType,
	                              SDValue Target, ArgListTy &&ArgsList) {
	    RetTy = ResultType;
	    Callee = Target;
	    CallConv = CC;
	    NumFixedArgs = ArgsList.size();
	    Args = std::move(ArgsList);
	    return *this;
	  }

	  /// Configure the call from an IR call site: the return attributes, the
	  /// calling convention and the "return value used" flag are read from
	  /// \p Call, while var-arg-ness and the fixed argument count come from
	  /// \p FTy.
	  CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy,
	                              SDValue Target, ArgListTy &&ArgsList,
	                              ImmutableCallSite Call) {
	    RetTy = ResultType;

	    IsInReg = Call.hasRetAttr(Attribute::InReg);
	    // Besides an explicit no-return call, a call that is not an invoke and
	    // is directly followed by an UnreachableInst is treated as non-returning.
	    DoesNotReturn =
	        Call.doesNotReturn() ||
	        (!Call.isInvoke() &&
	         isa<UnreachableInst>(Call.getInstruction()->getNextNode()));
	    IsVarArg = FTy->isVarArg();
	    IsReturnValueUsed = !Call.getInstruction()->use_empty();
	    RetSExt = Call.hasRetAttr(Attribute::SExt);
	    RetZExt = Call.hasRetAttr(Attribute::ZExt);

	    Callee = Target;

	    CallConv = Call.getCallingConv();
	    NumFixedArgs = FTy->getNumParams();
	    Args = std::move(ArgsList);

	    CS = Call;

	    return *this;
	  }

	  CallLoweringInfo &setInRegister(bool Value = true) {
	    IsInReg = Value;
	    return *this;
	  }

	  CallLoweringInfo &setNoReturn(bool Value = true) {
	    DoesNotReturn = Value;
	    return *this;
	  }

	  CallLoweringInfo &setVarArg(bool Value = true) {
	    IsVarArg = Value;
	    return *this;
	  }

	  CallLoweringInfo &setTailCall(bool Value = true) {
	    IsTailCall = Value;
	    return *this;
	  }

	  /// Note the inversion: discarding the result clears IsReturnValueUsed.
	  CallLoweringInfo &setDiscardResult(bool Value = true) {
	    IsReturnValueUsed = !Value;
	    return *this;
	  }

	  CallLoweringInfo &setConvergent(bool Value = true) {
	    IsConvergent = Value;
	    return *this;
	  }

	  CallLoweringInfo &setSExtResult(bool Value = true) {
	    RetSExt = Value;
	    return *this;
	  }

	  CallLoweringInfo &setZExtResult(bool Value = true) {
	    RetZExt = Value;
	    return *this;
	  }

	  CallLoweringInfo &setIsPatchPoint(bool Value = true) {
	    IsPatchPoint = Value;
	    return *this;
	  }

	  CallLoweringInfo &setIsPostTypeLegalization(bool Value = true) {
	    IsPostTypeLegalization = Value;
	    return *this;
	  }

	  /// Mutable access to the argument list.
	  ArgListTy &getArgs() {
	    return Args;
	  }
	};

	/// This function lowers an abstract call to a function into an actual call.
	/// This returns a pair of operands. The first element is the return value
	/// for the function (if RetTy is not VoidTy). The second element is the
	/// outgoing token chain. It calls LowerCall to do the actual lowering.
	std::pair<SDValue, SDValue> LowerCallTo(CallLoweringInfo &CLI) const;

	/// This hook must be implemented to lower calls into the specified
	/// DAG. The outgoing arguments to the call are described by the Outs array,
	/// and the values to be returned by the call are described by the Ins
	/// array. The implementation should fill in the InVals array with legal-type
	/// return values from the call, and return the resulting token chain value.
	///
	/// The default implementation is unreachable ("Not Implemented").
	virtual SDValue
	LowerCall(CallLoweringInfo & /*CLI*/,
	          SmallVectorImpl<SDValue> & /*InVals*/) const {
	  llvm_unreachable("Not Implemented");
	}

	/// Target-specific cleanup for formal ByVal parameters. The default
	/// implementation does nothing.
	virtual void HandleByVal(CCState *, unsigned &, unsigned) const {
	  // Intentionally empty.
	}

	/// This hook should be implemented to check whether the return values
	/// described by the Outs array can fit into the return registers. If false
	/// is returned, an sret-demotion is performed.
	///
	/// The default implementation ignores its arguments and returns true.
	virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/,
	                            MachineFunction & /*MF*/, bool /*isVarArg*/,
	                            const SmallVectorImpl<ISD::OutputArg> & /*Outs*/,
	                            LLVMContext & /*Context*/) const {
	  // Return true by default to get preexisting behavior.
	  return true;
	}

	/// This hook must be implemented to lower outgoing return values, described
	/// by the Outs array, into the specified DAG. The implementation should
	/// return the resulting token chain value.
	///
	/// The default implementation is unreachable ("Not Implemented").
	virtual SDValue LowerReturn(SDValue /*Chain*/, CallingConv::ID /*CallConv*/,
	                            bool /*isVarArg*/,
	                            const SmallVectorImpl<ISD::OutputArg> & /*Outs*/,
	                            const SmallVectorImpl<SDValue> & /*OutVals*/,
	                            const SDLoc & /*dl*/,
	                            SelectionDAG & /*DAG*/) const {
	  llvm_unreachable("Not Implemented");
	}

	/// Return true if result of the specified node is used by a return node
	/// only. It also compute and return the input chain for the tail call.
	///
	/// This is used to determine whether it is possible to codegen a libcall as
	/// tail call at legalization time.
	virtual bool isUsedByReturnOnly(SDNode , SDValue &/Chain*/) const {
	return false;
	}

	/// Return true if the target may be able to emit the call instruction as a
	/// tail call. This is used by optimization passes to determine if it's
	/// profitable to duplicate return instructions to enable tailcall
	/// optimization. The default implementation returns false.
	virtual bool mayBeEmittedAsTailCall(const CallInst *) const { return false; }

	/// Return the builtin name for the __builtin___clear_cache intrinsic.
	/// The default is to invoke the clear cache library call, i.e. the name
	/// "__clear_cache" is returned.
	virtual const char *getClearCacheBuiltinName() const { return "__clear_cache"; }

	/// Return the register ID of the name passed in. Used by named register
	/// global variables extension. There is no target-independent behaviour
	/// so the default action is to bail: the default implementation reports a
	/// fatal error without looking at any of its arguments.
	virtual unsigned getRegisterByName(const char *RegName, EVT VT,
	                                   SelectionDAG &DAG) const {
	  report_fatal_error("Named registers not implemented for this target");
	}

	/// Return the type that should be used to zero or sign extend a
	/// zeroext/signext integer return value. FIXME: Some C calling conventions
	/// require the return type to be promoted, but this is not true all the time,
	/// e.g. i1/i8/i16 on x86/x86_64. It is also not necessary for non-C calling
	/// conventions. The frontend should handle this and include all of the
	/// necessary information.
	///
	/// The default implementation ignores the extension kind: it returns the
	/// register type for i32 when \p VT is narrower than that type, and \p VT
	/// itself otherwise.
	virtual EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
	                                ISD::NodeType /*ExtendKind*/) const {
	  EVT MinVT = getRegisterType(Context, MVT::i32);
	  return VT.bitsLT(MinVT) ? MinVT : VT;
	}

	/// For some targets, an LLVM struct type must be broken down into multiple
	/// simple types, but the calling convention specifies that the entire struct
	/// must be passed in a block of consecutive registers.
	/// The default implementation returns false.
	virtual bool functionArgumentNeedsConsecutiveRegisters(
	    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
	  return false;
	}

	/// Returns a 0 terminated array of registers that can be safely used as
	/// scratch registers. The default implementation returns a null pointer.
	virtual const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const { return nullptr; }

	/// This callback is used to prepare for a volatile or atomic load.
	/// It takes a chain node as input and returns the chain for the load itself.
	///
	/// Having a callback like this is necessary for targets like SystemZ,
	/// which allows a CPU to reuse the result of a previous load indefinitely,
	/// even if a cache-coherent store is performed by another CPU. The default
	/// implementation does nothing and hands \p Chain straight back.
	virtual SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
	                                            SelectionDAG &DAG) const {
	  return Chain;
	}

	/// This callback is used to inspect load/store instructions and add
	/// target-specific MachineMemOperand flags to them. The default
	/// implementation does nothing, i.e. it returns MachineMemOperand::MONone.
	virtual MachineMemOperand::Flags getMMOFlags(const Instruction &I) const { return MachineMemOperand::MONone; }

	/// This callback is invoked by the type legalizer to legalize nodes with an
	/// illegal operand type but legal result types. It replaces the
	/// LowerOperation callback in the type Legalizer. The reason we can not do
	/// away with LowerOperation entirely is that LegalizeDAG isn't yet ready to
	/// use this callback.
	///
	/// TODO: Consider merging with ReplaceNodeResults.
	///
	/// The target places new result values for the node in Results (their number
	/// and types must exactly match those of the original return values of
	/// the node), or leaves Results empty, which indicates that the node is not
	/// to be custom lowered after all.
	/// The default implementation calls LowerOperation.
	virtual void LowerOperationWrapper(SDNode *N,
	SmallVectorImpl<SDValue> &Results,
	SelectionDAG &DAG) const;

	/// This callback is invoked for operations that are unsupported by the
	/// target, which are registered to use 'custom' lowering, and whose defined
	/// values are all legal. If the target has no operations that require custom
	/// lowering, it need not implement this. The default implementation of this
	/// aborts.
	virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;

	/// This callback is invoked when a node result type is illegal for the
	/// target, and the operation was registered to use 'custom' lowering for that
	/// result type. The target places new result values for the node in Results
	/// (their number and types must exactly match those of the original return
	/// values of the node), or leaves Results empty, which indicates that the
	/// node is not to be custom lowered after all.
	///
	/// If the target has no operations that require custom lowering, it need not
	/// implement this. The default implementation aborts.
	virtual void ReplaceNodeResults(SDNode * /*N*/,
	                                SmallVectorImpl<SDValue> & /*Results*/,
	                                SelectionDAG & /*DAG*/) const {
	  llvm_unreachable("ReplaceNodeResults not implemented for this target!");
	}

	/// This method returns the name of a target specific DAG node.
	virtual const char *getTargetNodeName(unsigned Opcode) const;

	/// This method returns a target specific FastISel object, or null if the
	/// target does not support "fast" ISel. The default implementation returns
	/// null.
	virtual FastISel *
	createFastISel(FunctionLoweringInfo &, const TargetLibraryInfo *) const {
	  return nullptr;
	}

	/// NOTE(review): undocumented in this header. Judging by the name, this
	/// presumably checks that the argument of a return-address style node \p Op
	/// is a constant -- confirm against the out-of-line definition, including
	/// which boolean value signals success.
	bool verifyReturnAddressArgumentIsConstant(SDValue Op,
	SelectionDAG &DAG) const;

	//===--------------------------------------------------------------------===//
	// Inline Asm Support hooks
	//

	/// This hook allows the target to expand an inline asm call to be explicit
	/// llvm code if it wants to. This is useful for turning simple inline asms
	/// into LLVM intrinsics, which gives the compiler more information about the
	/// behavior of the code. The default implementation returns false.
	virtual bool ExpandInlineAsm(CallInst *) const { return false; }

	/// Kinds of inline asm constraint.
	enum ConstraintType {
	  /// Constraint represents specific register(s).
	  C_Register,
	  /// Constraint represents any of register(s) in class.
	  C_RegisterClass,
	  /// Memory constraint.
	  C_Memory,
	  /// Something else.
	  C_Other,
	  /// Unsupported constraint.
	  C_Unknown
	};

	/// Weight values for inline asm constraint matching. The "well-known"
	/// enumerators are aliases for the generic ones.
	enum ConstraintWeight {
	  // Generic weights.

	  /// No match.
	  CW_Invalid = -1,
	  /// Acceptable.
	  CW_Okay = 0,
	  /// Good weight.
	  CW_Good = 1,
	  /// Better weight.
	  CW_Better = 2,
	  /// Best weight.
	  CW_Best = 3,

	  // Well-known weights.

	  /// Specific register operands.
	  CW_SpecificReg = CW_Okay,
	  /// Register operands.
	  CW_Register = CW_Good,
	  /// Memory operands.
	  CW_Memory = CW_Better,
	  /// Constant operand.
	  CW_Constant = CW_Best,
	  /// Default or don't know type.
	  CW_Default = CW_Okay
	};

	/// Per-constraint state for an inline asm operand that is being lowered. It
	/// extends InlineAsm::ConstraintInfo with the code, type and value chosen
	/// for the operand.
	struct AsmOperandInfo : public InlineAsm::ConstraintInfo {
	  /// The actual string for the code, like "m". TargetLowering picks the
	  /// 'best' code from ConstraintInfo::Codes that most closely matches the
	  /// operand.
	  std::string ConstraintCode;

	  /// Information about the constraint code, e.g. Register, RegisterClass,
	  /// Memory, Other, Unknown.
	  TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown;

	  /// Null if this is the result output operand or a clobber; otherwise the
	  /// incoming operand to the CallInst. This gets modified as the asm is
	  /// processed.
	  Value *CallOperandVal = nullptr;

	  /// The ValueType for the operand value.
	  MVT ConstraintVT = MVT::Other;

	  /// Converting constructor: builds the operand info from a ConstraintInfo,
	  /// which is moved into the base class; the other members keep their
	  /// in-class defaults.
	  AsmOperandInfo(InlineAsm::ConstraintInfo Info)
	      : InlineAsm::ConstraintInfo(std::move(Info)) {}

	  /// Return true if this is an input operand that is a matching constraint
	  /// like "4".
	  bool isMatchingInputConstraint() const;

	  /// If this is an input matching constraint, this method returns the output
	  /// operand it matches.
	  unsigned getMatchedOperand() const;
	};

	using AsmOperandInfoVector = std::vector<AsmOperandInfo>;

	/// Split up the constraint string from the inline assembly value into the
	/// specific constraints and their prefixes, and also tie in the associated
	/// operand values. If this returns an empty vector, and if the constraint
	/// string itself isn't empty, there was an error parsing.
	virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL,
	const TargetRegisterInfo *TRI,
	ImmutableCallSite CS) const;

	/// Examine constraint type and operand type and determine a weight value.
	/// The operand object must already have been set up with the operand type.
	virtual ConstraintWeight getMultipleConstraintMatchWeight(
	AsmOperandInfo &info, int maIndex) const;

	/// Examine constraint string and operand type and determine a weight value.
	/// The operand object must already have been set up with the operand type.
	virtual ConstraintWeight getSingleConstraintMatchWeight(
	AsmOperandInfo &info, const char *constraint) const;

	/// Determines the constraint code and constraint type to use for the specific
	/// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
	/// If the actual operand being passed in is available, it can be passed in as
	/// Op, otherwise an empty SDValue can be passed.
	virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo,
	SDValue Op,
	SelectionDAG *DAG = nullptr) const;

	/// Given a constraint, return the type of constraint it is for this target.
	virtual ConstraintType getConstraintType(StringRef Constraint) const;

	/// Given a physical register constraint (e.g. {edx}), return the register
	/// number and the register class for the register.
	///
	/// Given a register class constraint, like 'r', if this corresponds directly
	/// to an LLVM register class, return a register of 0 and the register class
	/// pointer.
	///
	/// This should only be used for C_Register constraints. On error, this
	/// returns a register number of 0 and a null register class pointer.
	virtual std::pair<unsigned, const TargetRegisterClass *>
	getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint, MVT VT) const;

	/// Map a constraint code onto the corresponding InlineAsm::Constraint_*
	/// value. This default implementation recognizes "i" and "m"; any other
	/// code yields InlineAsm::Constraint_Unknown.
	virtual unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const {
	  if (ConstraintCode == "i")
	    return InlineAsm::Constraint_i;
	  if (ConstraintCode == "m")
	    return InlineAsm::Constraint_m;
	  return InlineAsm::Constraint_Unknown;
	}

	/// Try to replace an X constraint, which matches anything, with another that
	/// has more specific requirements based on the type of the corresponding
	/// operand. This returns null if there is no replacement to make.
	virtual const char *LowerXConstraint(EVT ConstraintVT) const;

	/// Lower the specified operand into the Ops vector. If it is invalid, don't
	/// add anything to Ops.
	virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
	std::vector<SDValue> &Ops,
	SelectionDAG &DAG) const;

	//===--------------------------------------------------------------------===//
	// Div utility functions
	//
	/// NOTE(review): undocumented in this header. Presumably builds a
	/// replacement sequence for the signed division node \p N by the constant
	/// \p Divisor and reports the nodes it creates through \p Created --
	/// confirm against the out-of-line definition.
	SDValue BuildSDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
	                  bool IsAfterLegalization,
	                  std::vector<SDNode *> *Created) const;
	/// NOTE(review): undocumented in this header. Presumably builds a
	/// replacement sequence for the unsigned division node \p N by the constant
	/// \p Divisor and reports the nodes it creates through \p Created --
	/// confirm against the out-of-line definition.
	SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
	                  bool IsAfterLegalization,
	                  std::vector<SDNode *> *Created) const;

	/// Targets may override this function to provide custom SDIV lowering for
	/// power-of-2 denominators. If the target returns an empty SDValue, LLVM
	/// assumes SDIV is expensive and replaces it with a series of other integer
	/// operations.
	virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	                              SelectionDAG &DAG,
	                              std::vector<SDNode *> *Created) const;

	/// Indicate whether this target prefers to combine FDIVs with the same
	/// divisor. If the transform should never be done, return zero. If the
	/// transform should be done, return the minimum number of divisor uses
	/// that must exist. The default implementation returns zero.
	virtual unsigned combineRepeatedFPDivisors() const { return 0; }

	/// Hooks for building estimates in place of slower divisions and square
	/// roots.

	/// Return either a square root or its reciprocal estimate value for the input
	/// operand.
	/// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
	/// 'Enabled' as set by a potential default override attribute.
	/// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
	/// refinement iterations required to generate a sufficient (though not
	/// necessarily IEEE-754 compliant) estimate is returned in that parameter.
	/// The boolean UseOneConstNR output is used to select a Newton-Raphson
	/// algorithm implementation that uses either one or two constants.
	/// The boolean Reciprocal is used to select whether the estimate is for the
	/// square root of the input operand or the reciprocal of its square root.
	/// A target may choose to implement its own refinement within this function.
	/// If that's true, then return '0' as the number of RefinementSteps to avoid
	/// any further refinement of the estimate.
	/// An empty SDValue return means no estimate sequence can be created; that
	/// is what the default implementation returns.
	virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
	                                int Enabled, int &RefinementSteps,
	                                bool &UseOneConstNR,
	                                bool Reciprocal) const {
	  return SDValue();
	}

	/// Return a reciprocal estimate value for the input operand.
	/// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
	/// 'Enabled' as set by a potential default override attribute.
	/// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
	/// refinement iterations required to generate a sufficient (though not
	/// necessarily IEEE-754 compliant) estimate is returned in that parameter.
	/// A target may choose to implement its own refinement within this function.
	/// If that's true, then return '0' as the number of RefinementSteps to avoid
	/// any further refinement of the estimate.
	/// An empty SDValue return means no estimate sequence can be created; that
	/// is what the default implementation returns.
	virtual SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
	                                 int Enabled,
	                                 int &RefinementSteps) const {
	  return SDValue();
	}

	//===--------------------------------------------------------------------===//
	// Legalization utility functions
	//

	/// Expand a MUL or [US]MUL_LOHI of n-bit values into two or four nodes,
	/// respectively, each computing an n/2-bit part of the result.
	/// \param Result A vector that will be filled with the parts of the result
	/// in little-endian order.
	/// \param LL Low bits of the LHS of the MUL. You can use this parameter
	/// if you want to control how low bits are extracted from the LHS.
	/// \param LH High bits of the LHS of the MUL. See LL for meaning.
	/// \param RL Low bits of the RHS of the MUL. See LL for meaning
	/// \param RH High bits of the RHS of the MUL. See LL for meaning.
	/// \returns true if the node has been expanded, false if it has not
	bool expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, SDValue LHS,
	SDValue RHS, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
	SelectionDAG &DAG, MulExpansionKind Kind,
	SDValue LL = SDValue(), SDValue LH = SDValue(),
	SDValue RL = SDValue(), SDValue RH = SDValue()) const;

	/// Expand a MUL into two nodes. One that computes the high bits of
	/// the result and one that computes the low bits.
	/// \param HiLoVT The value type to use for the Lo and Hi nodes.
	/// \param LL Low bits of the LHS of the MUL. You can use this parameter
	/// if you want to control how low bits are extracted from the LHS.
	/// \param LH High bits of the LHS of the MUL. See LL for meaning.
	/// \param RL Low bits of the RHS of the MUL. See LL for meaning
	/// \param RH High bits of the RHS of the MUL. See LL for meaning.
	/// \returns true if the node has been expanded. false if it has not
	bool expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
	SelectionDAG &DAG, MulExpansionKind Kind,
	SDValue LL = SDValue(), SDValue LH = SDValue(),
	SDValue RL = SDValue(), SDValue RH = SDValue()) const;

	/// Expand float(f32) to SINT(i64) conversion
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Turn load of vector type into a load of the individual elements.
	/// \param LD load to expand
	/// \returns MERGE_VALUEs of the scalar loads with their chains.
	SDValue scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const;

	/// Turn a store of a vector type into stores of the individual elements.
	/// \param ST Store with a vector value type
	/// \returns MERGE_VALUEs of the individual store chains.
	SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const;

	/// Expands an unaligned load to 2 half-size loads for an integer, and
	/// possibly more for vectors.
	std::pair<SDValue, SDValue> expandUnalignedLoad(LoadSDNode *LD,
	SelectionDAG &DAG) const;

	/// Expands an unaligned store to 2 half-size stores for integer values, and
	/// possibly more for vectors.
	SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const;

	/// Increments memory address \p Addr according to the type of the value
	/// \p DataVT that should be stored. If the data is stored in compressed
	/// form, the memory address should be incremented according to the number of
	/// the stored elements. This number is equal to the number of '1's bits
	/// in the \p Mask.
	/// \p DataVT is a vector type. \p Mask is a vector value.
	/// \p DataVT and \p Mask have the same number of vector elements.
	SDValue IncrementMemoryAddress(SDValue Addr, SDValue Mask, const SDLoc &DL,
	EVT DataVT, SelectionDAG &DAG,
	bool IsCompressedMemory) const;

	/// Get a pointer to vector element \p Idx located in memory for a vector of
	/// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
	/// bounds the returned pointer is unspecified, but will be within the vector
	/// bounds.
	SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
	SDValue Idx) const;

	//===--------------------------------------------------------------------===//
	// Instruction Emitting Hooks
	//

	/// This method should be implemented by targets that mark instructions with
	/// the 'usesCustomInserter' flag. These instructions are special in various
	/// ways, which require special support to insert. The specified MachineInstr
	/// is created but not inserted into any basic blocks, and this method is
	/// called to expand it into a sequence of instructions, potentially also
	/// creating new basic blocks and control flow.
	/// As long as the returned basic block is different (i.e., we created a new
	/// one), the custom inserter is free to modify the rest of \p MBB.
	virtual MachineBasicBlock *
	EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;

	/// This method should be implemented by targets that mark instructions with
	/// the 'hasPostISelHook' flag. These instructions must be adjusted after
	/// instruction selection by target hooks. e.g. To fill in optional defs for
	/// ARM 's' setting instructions.
	virtual void AdjustInstrPostInstrSelection(MachineInstr &MI,
	SDNode *Node) const;

	/// If this function returns true, SelectionDAGBuilder emits a
	/// LOAD_STACK_GUARD node when it is lowering Intrinsic::stackprotector.
	virtual bool useLoadStackGuardNode() const {
	return false;
	}

	/// Hook with no generic implementation: the default body is
	/// llvm_unreachable, so it may only be called on targets that override it.
	/// NOTE(review): judging by the name, this presumably combines the stack
	/// guard value \p Val with the frame pointer via XOR -- confirm against the
	/// overriding targets.
	virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
	const SDLoc &DL) const {
	llvm_unreachable("not implemented for this target");
	}

	/// Lower TLS global address SDNode for target independent emulated TLS model.
	virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
	SelectionDAG &DAG) const;

	// seteq(x, 0) -> truncate(srl(ctlz(zext(x)), log2(#bits)))
	// If we're comparing for equality to zero and isCtlzFast is true, expose the
	// fact that this can be implemented as a ctlz/srl pair, so that the dag
	// combiner can fold the new nodes.
	SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const;

	private:
	SDValue simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
	ISD::CondCode Cond, DAGCombinerInfo &DCI,
	const SDLoc &DL) const;
	};

	/// Given an LLVM IR type and return type attributes, compute the return value
	/// EVTs and flags, and optionally also the offsets, if the return value is
	/// being lowered to memory.
	void GetReturnInfo(Type *ReturnType, AttributeList attr,
	SmallVectorImpl<ISD::OutputArg> &Outs,
	const TargetLowering &TLI, const DataLayout &DL);

	} // end namespace llvm

	#endif // LLVM_CODEGEN_TARGETLOWERING_H
	Index: head/contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h
	===================================================================
	--- head/contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h (revision 328816)
	+++ head/contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h (revision 328817)
	@@ -1,447 +1,454 @@
	//===- TargetPassConfig.h - Code Generation pass options --------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// Target-Independent Code Generator Pass Configuration Options pass.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_TARGETPASSCONFIG_H
	#define LLVM_CODEGEN_TARGETPASSCONFIG_H

	#include "llvm/Pass.h"
	#include "llvm/Support/CodeGen.h"
	#include <cassert>
	#include <string>

	namespace llvm {

	class LLVMTargetMachine;
	struct MachineSchedContext;
	class PassConfigImpl;
	class ScheduleDAGInstrs;

	// The old pass manager infrastructure is hidden in a legacy namespace now.
	namespace legacy {

	class PassManagerBase;

	} // end namespace legacy

	using legacy::PassManagerBase;

	/// Discriminated union of Pass ID types.
	///
	/// The PassConfig API prefers dealing with IDs because they are safer and more
	/// efficient. IDs decouple configuration from instantiation. This way, when a
	/// pass is overridden, it isn't unnecessarily instantiated. It is also unsafe
	/// to refer to a Pass pointer after adding it to a pass manager, which deletes
	/// redundant pass instances.
	///
	/// However, it is convenient to directly instantiate target passes with
	/// non-default ctors. These often don't have a registered PassInfo. Rather than
	/// force all target passes to implement the pass registry boilerplate, allow
	/// the PassConfig API to handle either type.
	///
	/// AnalysisID is sadly char*, so PointerIntPair won't work.
	class IdentifyingPassPtr {
	// Both alternatives share storage; IsInstance records which one was stored.
	union {
	AnalysisID ID;
	Pass *P;
	};
	bool IsInstance = false;

	public:
	/// Construct an empty value: a null pointer that is not a pass instance.
	IdentifyingPassPtr() : P(nullptr) {}
	/// Wrap a pass ID; IsInstance keeps its default of false.
	IdentifyingPassPtr(AnalysisID IDPtr) : ID(IDPtr) {}
	/// Wrap a concrete pass instance and mark the value as an instance.
	IdentifyingPassPtr(Pass *InstancePtr) : P(InstancePtr), IsInstance(true) {}

	/// True if the stored pointer is non-null. This tests the P member
	/// regardless of which alternative was stored.
	bool isValid() const { return P; }
	/// True if this holds a Pass instance rather than a pass ID.
	bool isInstance() const { return IsInstance; }

	/// Return the stored pass ID; asserts that this is not a pass instance.
	AnalysisID getID() const {
	assert(!IsInstance && "Not a Pass ID");
	return ID;
	}

	/// Return the stored pass instance; asserts that this is an instance.
	Pass *getInstance() const {
	assert(IsInstance && "Not a Pass Instance");
	return P;
	}
	};

	// Declare IdentifyingPassPtr POD-like through the isPodLike trait.
	template <> struct isPodLike<IdentifyingPassPtr> {
	static const bool value = true;
	};

	/// Target-Independent Code Generator Pass Configuration Options.
	///
	/// This is an ImmutablePass solely for the purpose of exposing CodeGen options
	/// to the internals of other CodeGen passes.
	class TargetPassConfig : public ImmutablePass {
	public:
	/// Pseudo Pass IDs. These are defined within TargetPassConfig because they
	/// are unregistered pass IDs. They are only useful for use with
	/// TargetPassConfig APIs to identify multiple occurrences of the same pass.
	///

	/// EarlyTailDuplicate - A clone of the TailDuplicate pass that runs early
	/// during codegen, on SSA form.
	static char EarlyTailDuplicateID;

	/// PostRAMachineLICM - A clone of the LICM pass that runs during late machine
	/// optimization after regalloc.
	static char PostRAMachineLICMID;

	private:
	PassManagerBase *PM = nullptr;
	AnalysisID StartBefore = nullptr;
	AnalysisID StartAfter = nullptr;
	AnalysisID StopBefore = nullptr;
	AnalysisID StopAfter = nullptr;
	bool Started = true;
	bool Stopped = false;
	bool AddingMachinePasses = false;

	/// Set the StartAfter, StartBefore and StopAfter passes to allow running only
	/// a portion of the normal code-gen pass sequence.
	///
	/// If the StartAfter and StartBefore pass ID is zero, then compilation will
	/// begin at the normal point; otherwise, clear the Started flag to indicate
	/// that passes should not be added until the starting pass is seen. If the
	/// Stop pass ID is zero, then compilation will continue to the end.
	///
	/// This function expects that at least one of the StartAfter or the
	/// StartBefore pass IDs is null.
	void setStartStopPasses();

	protected:
	LLVMTargetMachine *TM;
	PassConfigImpl *Impl = nullptr; // Internal data structures
	bool Initialized = false; // Flagged after all passes are configured.

	// Target Pass Options
	// Targets provide a default setting, user flags override.
	bool DisableVerify = false;

	/// Default setting for -enable-tail-merge on this target.
	bool EnableTailMerge = true;

	/// Require processing of functions such that callees are generated before
	/// callers.
	bool RequireCodeGenSCCOrder = false;

	/// Add the actual instruction selection passes. This does not include
	/// preparation passes on IR.
	bool addCoreISelPasses();

	public:
	TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm);
	// Dummy constructor.
	TargetPassConfig();

	~TargetPassConfig() override;

	static char ID;

	/// Get the right type of TargetMachine for this target.
	///
	/// \tparam TMC the concrete target-machine class to view TM as.
	/// \returns a reference to TM downcast to \p TMC. The cast is an unchecked
	/// static_cast, so the caller is responsible for naming the right type.
	template<typename TMC> TMC &getTM() const {
	return *static_cast<TMC *>(TM);
	}

	//
	void setInitialized() { Initialized = true; }

	CodeGenOpt::Level getOptLevel() const;

	/// Describe the status of the codegen
	/// pipeline set by this target pass config.
	/// Having a limited codegen pipeline means that options
	/// have been used to restrict what codegen is doing.
	/// In particular, that means that codegen won't emit
	/// assembly code.
	bool hasLimitedCodeGenPipeline() const;

	/// If hasLimitedCodeGenPipeline is true, this method
	/// returns a string with the name of the options, separated
	/// by \p Separator that caused this pipeline to be limited.
	std::string
	getLimitedCodeGenPipelineReason(const char *Separator = "/") const;

	/// Check if the codegen pipeline is limited in such a way that it
	/// won't be complete. When the codegen pipeline is not complete,
	/// this means it may not be possible to generate assembly from it.
	///
	/// \returns true when the pipeline is not limited at all, or when neither
	/// StopAfter nor StopBefore is set.
	bool willCompleteCodeGenPipeline() const {
	return !hasLimitedCodeGenPipeline() || (!StopAfter && !StopBefore);
	}

	void setDisableVerify(bool Disable) { setOpt(DisableVerify, Disable); }

	bool getEnableTailMerge() const { return EnableTailMerge; }
	void setEnableTailMerge(bool Enable) { setOpt(EnableTailMerge, Enable); }

	bool requiresCodeGenSCCOrder() const { return RequireCodeGenSCCOrder; }
	void setRequiresCodeGenSCCOrder(bool Enable = true) {
	setOpt(RequireCodeGenSCCOrder, Enable);
	}

	/// Allow the target to override a specific pass without overriding the pass
	/// pipeline. When passes are added to the standard pipeline at the
	/// point where StandardID is expected, add TargetID in its place.
	void substitutePass(AnalysisID StandardID, IdentifyingPassPtr TargetID);

	/// Insert InsertedPassID pass after TargetPassID pass.
	void insertPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID,
	bool VerifyAfter = true, bool PrintAfter = true);

	/// Allow the target to enable a specific standard pass by default.
	void enablePass(AnalysisID PassID) { substitutePass(PassID, PassID); }

	/// Allow the target to disable a specific standard pass by default.
	void disablePass(AnalysisID PassID) {
	substitutePass(PassID, IdentifyingPassPtr());
	}

	/// Return the pass substituted for StandardID by the target.
	/// If no substitution exists, return StandardID.
	IdentifyingPassPtr getPassSubstitution(AnalysisID StandardID) const;

	/// Return true if the pass has been substituted by the target or
	/// overridden on the command line.
	bool isPassSubstitutedOrOverridden(AnalysisID ID) const;

	/// Return true if the optimized regalloc pipeline is enabled.
	bool getOptimizeRegAlloc() const;

	/// Return true if shrink wrapping is enabled.
	bool getEnableShrinkWrap() const;

	/// Return true if the default global register allocator is in use and
	/// has not been overridden on the command line with '-regalloc=...'
	bool usingDefaultRegAlloc() const;

	/// High level function that adds all passes necessary to go from llvm IR
	/// representation to the MI representation.
	/// Adds IR based lowering and target specific optimization passes and finally
	/// the core instruction selection passes.
	/// \returns true if an error occurred, false otherwise.
	bool addISelPasses();

	/// Add common target configurable passes that perform LLVM IR to IR
	/// transforms following machine independent optimization.
	virtual void addIRPasses();

	/// Add passes to lower exception handling for the code generator.
	void addPassesToHandleExceptions();

	/// Add pass to prepare the LLVM IR for code generation. This should be done
	/// before exception handling preparation passes.
	virtual void addCodeGenPrepare();

	/// Add common passes that perform LLVM IR to IR transforms in preparation for
	/// instruction selection.
	virtual void addISelPrepare();

	/// addInstSelector - This method should install an instruction selector pass,
	/// which converts from LLVM code to machine instructions.
	virtual bool addInstSelector() {
	return true;
	}

	/// This method should install an IR translator pass, which converts from
	/// LLVM code to machine instructions with possibly generic opcodes.
	virtual bool addIRTranslator() { return true; }

	/// This method may be implemented by targets that want to run passes
	/// immediately before legalization.
	virtual void addPreLegalizeMachineIR() {}

	/// This method should install a legalize pass, which converts the instruction
	/// sequence into one that can be selected by the target.
	virtual bool addLegalizeMachineIR() { return true; }

	/// This method may be implemented by targets that want to run passes
	/// immediately before the register bank selection.
	virtual void addPreRegBankSelect() {}

	/// This method should install a register bank selector pass, which
	/// assigns register banks to virtual registers without a register
	/// class or register banks.
	virtual bool addRegBankSelect() { return true; }

	/// This method may be implemented by targets that want to run passes
	/// immediately before the (global) instruction selection.
	virtual void addPreGlobalInstructionSelect() {}

	/// This method should install a (global) instruction selector pass, which
	/// converts possibly generic instructions to fully target-specific
	/// instructions, thereby constraining all generic virtual registers to
	/// register classes.
	virtual bool addGlobalInstructionSelect() { return true; }

	/// Add the complete, standard set of LLVM CodeGen passes.
	/// Fully developed targets will not generally override this.
	virtual void addMachinePasses();

	/// Create an instance of ScheduleDAGInstrs to be run within the standard
	/// MachineScheduler pass for this function and target at the current
	/// optimization level.
	///
	/// This can also be used to plug a new MachineSchedStrategy into an instance
	/// of the standard ScheduleDAGMI:
	/// return new ScheduleDAGMI(C, make_unique<MyStrategy>(C), /*RemoveKillFlags=*/false)
	///
	/// Return NULL to select the default (generic) machine scheduler.
	virtual ScheduleDAGInstrs *
	createMachineScheduler(MachineSchedContext *C) const {
	return nullptr;
	}

	/// Similar to createMachineScheduler but used when postRA machine scheduling
	/// is enabled.
	virtual ScheduleDAGInstrs *
	createPostMachineScheduler(MachineSchedContext *C) const {
	return nullptr;
	}

	/// printAndVerify - Add a pass to dump then verify the machine function, if
	/// those steps are enabled.
	void printAndVerify(const std::string &Banner);

	/// Add a pass to print the machine function if printing is enabled.
	void addPrintPass(const std::string &Banner);

	/// Add a pass to perform basic verification of the machine function if
	/// verification is enabled.
	void addVerifyPass(const std::string &Banner);

	/// Check whether or not GlobalISel should be enabled by default.
	/// Fallback/abort behavior is controlled via other methods.
	virtual bool isGlobalISelEnabled() const;

	/// Check whether or not GlobalISel should abort on error.
	/// When this is disabled, GlobalISel will fall back on SDISel instead of
	/// erroring out.
	bool isGlobalISelAbortEnabled() const;

	/// Check whether or not a diagnostic should be emitted when GlobalISel
	/// uses the fallback path. In other words, it will emit a diagnostic
	/// when GlobalISel failed and isGlobalISelAbortEnabled is false.
	virtual bool reportDiagnosticWhenGlobalISelFallback() const;

	protected:
	// Helper to verify the analysis is really immutable.
	void setOpt(bool &Opt, bool Val);

	/// Methods with trivial inline returns are convenient points in the common
	/// codegen pass pipeline where targets may insert passes. Methods with
	/// out-of-line standard implementations are major CodeGen stages called by
	/// addMachinePasses. Some targets may override major stages when inserting
	/// passes is insufficient, but maintaining overridden stages is more work.
	///

	/// addPreISel - This method should add any "last minute" LLVM->LLVM
	/// passes (which are run just before instruction selector).
	/// The default implementation adds nothing and returns true.
	virtual bool addPreISel() {
	return true;
	}

	/// addMachineSSAOptimization - Add standard passes that optimize machine
	/// instructions in SSA form.
	virtual void addMachineSSAOptimization();

	/// Add passes that optimize instruction level parallelism for out-of-order
	/// targets. These passes are run while the machine code is still in SSA
	/// form, so they can use MachineTraceMetrics to control their heuristics.
	///
	/// All passes added here should preserve the MachineDominatorTree,
	/// MachineLoopInfo, and MachineTraceMetrics analyses.
	virtual bool addILPOpts() {
	return false;
	}

	/// This method may be implemented by targets that want to run passes
	/// immediately before register allocation.
	virtual void addPreRegAlloc() { }

	/// createTargetRegisterAllocator - Create the register allocator pass for
	/// this target at the current optimization level.
	virtual FunctionPass *createTargetRegisterAllocator(bool Optimized);

	/// addFastRegAlloc - Add the minimum set of target-independent passes that
	/// are required for fast register allocation.
	virtual void addFastRegAlloc(FunctionPass *RegAllocPass);

	/// addOptimizedRegAlloc - Add passes related to register allocation.
	/// LLVMTargetMachine provides standard regalloc passes for most targets.
	virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);

	/// addPreRewrite - Add passes to the optimized register allocation pipeline
	/// after register allocation is complete, but before virtual registers are
	/// rewritten to physical registers.
	///
	/// These passes must preserve VirtRegMap and LiveIntervals, and when running
	/// after RABasic or RAGreedy, they should take advantage of LiveRegMatrix.
	/// When these passes run, VirtRegMap contains legal physreg assignments for
	/// all virtual registers.
	virtual bool addPreRewrite() {
	return false;
	}

	/// This method may be implemented by targets that want to run passes after
	/// register allocation pass pipeline but before prolog-epilog insertion.
	virtual void addPostRegAlloc() { }

	/// Add passes that optimize machine instructions after register allocation.
	virtual void addMachineLateOptimization();

	/// This method may be implemented by targets that want to run passes after
	/// prolog-epilog insertion and before the second instruction scheduling pass.
	virtual void addPreSched2() { }

	/// addGCPasses - Add late codegen passes that analyze code for garbage
	/// collection. This should return true if GC info should be printed after
	/// these passes.
	virtual bool addGCPasses();

	/// Add standard basic block placement passes.
	virtual void addBlockPlacement();

	/// This pass may be implemented by targets that want to run passes
	/// immediately before machine code is emitted.
	virtual void addPreEmitPass() { }

	+ /// Targets may add passes immediately before machine code is emitted in this
	+ /// callback. This is called even later than `addPreEmitPass`.
	+ // FIXME: Rename `addPreEmitPass` to something more sensible given its actual
	+ // position and remove the `2` suffix here as this callback is what
	+ // `addPreEmitPass` should be but in reality isn't.
	+ virtual void addPreEmitPass2() {}
	+
	/// Utilities for targets to add passes to the pass manager.
	///

	/// Add a CodeGen pass at this point in the pipeline after checking overrides.
	/// Return the pass that was added, or zero if no pass was added.
	/// @p printAfter if true and adding a machine function pass add an extra
	/// machine printer pass afterwards
	/// @p verifyAfter if true and adding a machine function pass add an extra
	/// machine verification pass afterwards.
	AnalysisID addPass(AnalysisID PassID, bool verifyAfter = true,
	bool printAfter = true);

	/// Add a pass to the PassManager if that pass is supposed to be run, as
	/// determined by the StartAfter and StopAfter options. Takes ownership of the
	/// pass.
	/// @p printAfter if true and adding a machine function pass add an extra
	/// machine printer pass afterwards
	/// @p verifyAfter if true and adding a machine function pass add an extra
	/// machine verification pass afterwards.
	void addPass(Pass *P, bool verifyAfter = true, bool printAfter = true);

	/// addMachinePasses helper to create the target-selected or overridden
	/// regalloc pass.
	FunctionPass *createRegAllocPass(bool Optimized);
	};

	} // end namespace llvm

	#endif // LLVM_CODEGEN_TARGETPASSCONFIG_H
	Index: head/contrib/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
	===================================================================
	--- head/contrib/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h (revision 328816)
	+++ head/contrib/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h (revision 328817)
	@@ -1,255 +1,258 @@
	//===- llvm/CodeGen/TargetSubtargetInfo.h - Target Information --*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the subtarget options of a Target machine.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_TARGETSUBTARGETINFO_H
	#define LLVM_CODEGEN_TARGETSUBTARGETINFO_H

	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/CodeGen/PBQPRAConstraint.h"
	#include "llvm/CodeGen/ScheduleDAGMutation.h"
	#include "llvm/CodeGen/SchedulerRegistry.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/Support/CodeGen.h"
	#include <memory>
	#include <vector>


	namespace llvm {

	class CallLowering;
	class InstrItineraryData;
	struct InstrStage;
	class InstructionSelector;
	class LegalizerInfo;
	class MachineInstr;
	struct MachineSchedPolicy;
	struct MCReadAdvanceEntry;
	struct MCWriteLatencyEntry;
	struct MCWriteProcResEntry;
	class RegisterBankInfo;
	class SDep;
	class SelectionDAGTargetInfo;
	struct SubtargetFeatureKV;
	struct SubtargetInfoKV;
	class SUnit;
	class TargetFrameLowering;
	class TargetInstrInfo;
	class TargetLowering;
	class TargetRegisterClass;
	class TargetRegisterInfo;
	class TargetSchedModel;
	class Triple;

	//===----------------------------------------------------------------------===//
	///
	/// TargetSubtargetInfo - Generic base class for all target subtargets. All
	/// Target-specific options that control code generation and printing should
	/// be exposed through a TargetSubtargetInfo-derived class.
	///
	class TargetSubtargetInfo : public MCSubtargetInfo {
	protected: // Can only create subclasses...
	/// Construct the subtarget from its triple, CPU name and feature string plus
	/// the feature and scheduling tables. All table arguments from \p ProcSched
	/// onwards are passed as pointers to const.
	/// NOTE(review): presumably these are forwarded unchanged to the
	/// MCSubtargetInfo base -- confirm against the out-of-line definition.
	TargetSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS,
	ArrayRef<SubtargetFeatureKV> PF,
	ArrayRef<SubtargetFeatureKV> PD,
	const SubtargetInfoKV *ProcSched,
	const MCWriteProcResEntry *WPR,
	const MCWriteLatencyEntry *WL,
	const MCReadAdvanceEntry *RA, const InstrStage *IS,
	const unsigned *OC, const unsigned *FP);

	public:
	// AntiDepBreakMode - Type of anti-dependence breaking that should
	// be performed before post-RA scheduling.
	using AntiDepBreakMode = enum { ANTIDEP_NONE, ANTIDEP_CRITICAL, ANTIDEP_ALL };
	using RegClassVector = SmallVectorImpl<const TargetRegisterClass *>;

	TargetSubtargetInfo() = delete;
	TargetSubtargetInfo(const TargetSubtargetInfo &) = delete;
	TargetSubtargetInfo &operator=(const TargetSubtargetInfo &) = delete;
	~TargetSubtargetInfo() override;

	virtual bool isXRaySupported() const { return false; }

	// Interfaces to the major aspects of target machine information:
	//
	// -- Instruction opcode and operand information
	// -- Pipelines and scheduling information
	// -- Stack frame information
	// -- Selection DAG lowering information
	// -- Call lowering information
	//
	// N.B. These objects may change during compilation. It's not safe to cache
	// them between functions.
	virtual const TargetInstrInfo *getInstrInfo() const { return nullptr; }
	virtual const TargetFrameLowering *getFrameLowering() const {
	return nullptr;
	}
	virtual const TargetLowering *getTargetLowering() const { return nullptr; }
	virtual const SelectionDAGTargetInfo *getSelectionDAGInfo() const {
	return nullptr;
	}
	virtual const CallLowering *getCallLowering() const { return nullptr; }

	// FIXME: This lets targets specialize the selector by subtarget (which lets
	// us do things like a dedicated avx512 selector). However, we might want
	// to also specialize selectors by MachineFunction, which would let us be
	// aware of optsize/optnone and such.
	virtual const InstructionSelector *getInstructionSelector() const {
	return nullptr;
	}

	virtual unsigned getHwMode() const { return 0; }

	/// Target can subclass this hook to select a different DAG scheduler.
	virtual RegisterScheduler::FunctionPassCtor
	getDAGScheduler(CodeGenOpt::Level) const {
	return nullptr;
	}

	virtual const LegalizerInfo *getLegalizerInfo() const { return nullptr; }

	/// getRegisterInfo - If register information is available, return it. If
	/// not, return null.
	virtual const TargetRegisterInfo *getRegisterInfo() const { return nullptr; }

	/// If the information for the register banks is available, return it.
	/// Otherwise return nullptr.
	virtual const RegisterBankInfo *getRegBankInfo() const { return nullptr; }

	/// getInstrItineraryData - Returns instruction itinerary data for the target
	/// or specific subtarget.
	virtual const InstrItineraryData *getInstrItineraryData() const {
	return nullptr;
	}

	/// Resolve a SchedClass at runtime, where SchedClass identifies an
	/// MCSchedClassDesc with the isVariant property. This may return the ID of
	/// another variant SchedClass, but repeated invocation must quickly terminate
	/// in a nonvariant SchedClass.
	virtual unsigned resolveSchedClass(unsigned SchedClass,
	const MachineInstr *MI,
	const TargetSchedModel *SchedModel) const {
	return 0;
	}

	/// \brief True if the subtarget should run MachineScheduler after aggressive
	/// coalescing.
	///
	/// This currently replaces the SelectionDAG scheduler with the "source" order
	/// scheduler (though see below for an option to turn this off and use the
	/// TargetLowering preference). It does not yet disable the postRA scheduler.
	virtual bool enableMachineScheduler() const;

	/// \brief Support printing of [latency:throughput] comment in output .S file.
	virtual bool supportPrintSchedInfo() const { return false; }

	/// \brief True if the machine scheduler should disable the TLI preference
	/// for preRA scheduling with the source level scheduler.
	virtual bool enableMachineSchedDefaultSched() const { return true; }

	/// \brief True if the subtarget should enable joining global copies.
	///
	/// By default this is enabled if the machine scheduler is enabled, but
	/// can be overridden.
	virtual bool enableJoinGlobalCopies() const;

	/// True if the subtarget should run a scheduler after register allocation.
	///
	/// By default this queries the PostRAScheduling bit in the scheduling model
	/// which is the preferred way to influence this.
	virtual bool enablePostRAScheduler() const;

	/// \brief True if the subtarget should run the atomic expansion pass.
	virtual bool enableAtomicExpand() const;

	+ /// True if the subtarget should run the indirectbr expansion pass.
	+ virtual bool enableIndirectBrExpand() const;
	+
	/// \brief Override generic scheduling policy within a region.
	///
	/// This is a convenient way for targets that don't provide any custom
	/// scheduling heuristics (no custom MachineSchedStrategy) to make
	/// changes to the generic scheduling policy.
	virtual void overrideSchedPolicy(MachineSchedPolicy &Policy,
	unsigned NumRegionInstrs) const {}

	// \brief Perform target specific adjustments to the latency of a schedule
	// dependency.
	virtual void adjustSchedDependency(SUnit def, SUnit use, SDep &dep) const {}

	// For use with PostRAScheduling: get the anti-dependence breaking that should
	// be performed before post-RA scheduling.
	virtual AntiDepBreakMode getAntiDepBreakMode() const { return ANTIDEP_NONE; }

	// For use with PostRAScheduling: in CriticalPathRCs, return any register
	// classes that should only be considered for anti-dependence breaking if they
	// are on the critical path.
	virtual void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
	return CriticalPathRCs.clear();
	}

	/// \brief Provide an ordered list of schedule DAG mutations for the post-RA
	/// scheduler. The default implementation leaves \p Mutations untouched.
	virtual void getPostRAMutations(
	std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
	}

	/// \brief Provide an ordered list of schedule DAG mutations for the machine
	/// pipeliner. The default implementation leaves \p Mutations untouched.
	virtual void getSMSMutations(
	std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
	}

	/// For use with PostRAScheduling: get the minimum optimization level needed
	/// to enable post-RA scheduling.
	///
	/// The default implementation returns CodeGenOpt::Default.
	virtual CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const {
	return CodeGenOpt::Default;
	}

	/// \brief True if the subtarget should run the local reassignment
	/// heuristic of the register allocator.
	/// This heuristic may be compile time intensive, \p OptLevel provides
	/// a finer grain to tune the register allocator.
	virtual bool enableRALocalReassignment(CodeGenOpt::Level OptLevel) const;

	/// \brief True if the subtarget should consider the cost of local intervals
	/// created by a split candidate when choosing the best split candidate. This
	/// heuristic may be compile time intensive.
	virtual bool enableAdvancedRASplitCost() const;

	/// \brief Enable use of alias analysis during code generation (during MI
	/// scheduling, DAGCombine, etc.).
	virtual bool useAA() const;

	/// \brief Enable the use of the early if conversion pass.
	///
	/// Disabled unless a subtarget overrides this hook.
	virtual bool enableEarlyIfConversion() const {
	return false;
	}

	/// \brief Return PBQPConstraint(s) for the target.
	///
	/// Override to provide custom PBQP constraints. The default implementation
	/// returns a null pointer.
	virtual std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const {
	return nullptr;
	}

	/// Enable tracking of subregister liveness in register allocator.
	/// Please use MachineRegisterInfo::subRegLivenessEnabled() instead where
	/// possible.
	///
	/// Disabled unless a subtarget overrides this hook.
	virtual bool enableSubRegLiveness() const {
	return false;
	}

	/// Returns string representation of scheduler comment
	std::string getSchedInfoStr(const MachineInstr &MI) const override;
	std::string getSchedInfoStr(MCInst const &MCI) const override;
	};

	} // end namespace llvm

	#endif // LLVM_CODEGEN_TARGETSUBTARGETINFO_H
	Index: head/contrib/llvm/include/llvm/InitializePasses.h
	===================================================================
	--- head/contrib/llvm/include/llvm/InitializePasses.h (revision 328816)
	+++ head/contrib/llvm/include/llvm/InitializePasses.h (revision 328817)
	@@ -1,388 +1,389 @@
	//===- llvm/InitializePasses.h - Initialize All Passes ----------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the declarations for the pass initialization routines
	// for the entire LLVM project.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_INITIALIZEPASSES_H
	#define LLVM_INITIALIZEPASSES_H

	namespace llvm {

	class PassRegistry;

	/// Initialize all passes linked into the Core library.
	void initializeCore(PassRegistry&);

	/// Initialize all passes linked into the TransformUtils library.
	void initializeTransformUtils(PassRegistry&);

	/// Initialize all passes linked into the ScalarOpts library.
	void initializeScalarOpts(PassRegistry&);

	/// Initialize all passes linked into the ObjCARCOpts library.
	void initializeObjCARCOpts(PassRegistry&);

	/// Initialize all passes linked into the Vectorize library.
	void initializeVectorization(PassRegistry&);

	/// Initialize all passes linked into the InstCombine library.
	void initializeInstCombine(PassRegistry&);

	/// Initialize all passes linked into the IPO library.
	void initializeIPO(PassRegistry&);

	/// Initialize all passes linked into the Instrumentation library.
	void initializeInstrumentation(PassRegistry&);

	/// Initialize all passes linked into the Analysis library.
	void initializeAnalysis(PassRegistry&);

	/// Initialize all passes linked into the Coroutines library.
	void initializeCoroutines(PassRegistry&);

	/// Initialize all passes linked into the CodeGen library.
	void initializeCodeGen(PassRegistry&);

	/// Initialize all passes linked into the GlobalISel library.
	void initializeGlobalISel(PassRegistry&);

	/// Initialize all passes linked into the Target library.
	void initializeTarget(PassRegistry&);

	void initializeAAEvalLegacyPassPass(PassRegistry&);
	void initializeAAResultsWrapperPassPass(PassRegistry&);
	void initializeADCELegacyPassPass(PassRegistry&);
	void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&);
	void initializeAddressSanitizerModulePass(PassRegistry&);
	void initializeAddressSanitizerPass(PassRegistry&);
	void initializeAliasSetPrinterPass(PassRegistry&);
	void initializeAlignmentFromAssumptionsPass(PassRegistry&);
	void initializeAlwaysInlinerLegacyPassPass(PassRegistry&);
	void initializeArgPromotionPass(PassRegistry&);
	void initializeAssumptionCacheTrackerPass(PassRegistry&);
	void initializeAtomicExpandPass(PassRegistry&);
	void initializeBDCELegacyPassPass(PassRegistry&);
	void initializeBarrierNoopPass(PassRegistry&);
	void initializeBasicAAWrapperPassPass(PassRegistry&);
	void initializeBlockExtractorPassPass(PassRegistry&);
	void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry&);
	void initializeBoundsCheckingLegacyPassPass(PassRegistry&);
	void initializeBranchFolderPassPass(PassRegistry&);
	void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&);
	void initializeBranchRelaxationPass(PassRegistry&);
	void initializeBreakCriticalEdgesPass(PassRegistry&);
	void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
	void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
	void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
	void initializeCFGPrinterLegacyPassPass(PassRegistry&);
	void initializeCFGSimplifyPassPass(PassRegistry&);
	void initializeCFGViewerLegacyPassPass(PassRegistry&);
	void initializeCFLAndersAAWrapperPassPass(PassRegistry&);
	void initializeCFLSteensAAWrapperPassPass(PassRegistry&);
	void initializeCallGraphDOTPrinterPass(PassRegistry&);
	void initializeCallGraphPrinterLegacyPassPass(PassRegistry&);
	void initializeCallGraphViewerPass(PassRegistry&);
	void initializeCallGraphWrapperPassPass(PassRegistry&);
	void initializeCodeGenPreparePass(PassRegistry&);
	void initializeConstantHoistingLegacyPassPass(PassRegistry&);
	void initializeCalledValuePropagationLegacyPassPass(PassRegistry &);
	void initializeConstantMergeLegacyPassPass(PassRegistry&);
	void initializeConstantPropagationPass(PassRegistry&);
	void initializeCorrelatedValuePropagationPass(PassRegistry&);
	void initializeCostModelAnalysisPass(PassRegistry&);
	void initializeEntryExitInstrumenterPass(PassRegistry&);
	void initializePostInlineEntryExitInstrumenterPass(PassRegistry&);
	void initializeCrossDSOCFIPass(PassRegistry&);
	void initializeDAEPass(PassRegistry&);
	void initializeDAHPass(PassRegistry&);
	void initializeDCELegacyPassPass(PassRegistry&);
	void initializeDSELegacyPassPass(PassRegistry&);
	void initializeDataFlowSanitizerPass(PassRegistry&);
	void initializeDeadInstEliminationPass(PassRegistry&);
	void initializeDeadMachineInstructionElimPass(PassRegistry&);
	void initializeDelinearizationPass(PassRegistry&);
	void initializeDemandedBitsWrapperPassPass(PassRegistry&);
	void initializeDependenceAnalysisPass(PassRegistry&);
	void initializeDependenceAnalysisWrapperPassPass(PassRegistry&);
	void initializeDetectDeadLanesPass(PassRegistry&);
	void initializeDivergenceAnalysisPass(PassRegistry&);
	void initializeDivRemPairsLegacyPassPass(PassRegistry&);
	void initializeDomOnlyPrinterPass(PassRegistry&);
	void initializeDomOnlyViewerPass(PassRegistry&);
	void initializeDomPrinterPass(PassRegistry&);
	void initializeDomViewerPass(PassRegistry&);
	void initializeDominanceFrontierWrapperPassPass(PassRegistry&);
	void initializeDominatorTreeWrapperPassPass(PassRegistry&);
	void initializeDwarfEHPreparePass(PassRegistry&);
	void initializeEarlyCSELegacyPassPass(PassRegistry&);
	void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry&);
	void initializeEarlyIfConverterPass(PassRegistry&);
	void initializeEdgeBundlesPass(PassRegistry&);
	void initializeEfficiencySanitizerPass(PassRegistry&);
	void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
	void initializeExpandISelPseudosPass(PassRegistry&);
	void initializeExpandMemCmpPassPass(PassRegistry&);
	void initializeExpandPostRAPass(PassRegistry&);
	void initializeExpandReductionsPass(PassRegistry&);
	void initializeExternalAAWrapperPassPass(PassRegistry&);
	void initializeFEntryInserterPass(PassRegistry&);
	void initializeFinalizeMachineBundlesPass(PassRegistry&);
	void initializeFlattenCFGPassPass(PassRegistry&);
	void initializeFloat2IntLegacyPassPass(PassRegistry&);
	void initializeForceFunctionAttrsLegacyPassPass(PassRegistry&);
	void initializeForwardControlFlowIntegrityPass(PassRegistry&);
	void initializeFuncletLayoutPass(PassRegistry&);
	void initializeFunctionImportLegacyPassPass(PassRegistry&);
	void initializeGCMachineCodeAnalysisPass(PassRegistry&);
	void initializeGCModuleInfoPass(PassRegistry&);
	void initializeGCOVProfilerLegacyPassPass(PassRegistry&);
	void initializeGVNHoistLegacyPassPass(PassRegistry&);
	void initializeGVNLegacyPassPass(PassRegistry&);
	void initializeGVNSinkLegacyPassPass(PassRegistry&);
	void initializeGlobalDCELegacyPassPass(PassRegistry&);
	void initializeGlobalMergePass(PassRegistry&);
	void initializeGlobalOptLegacyPassPass(PassRegistry&);
	void initializeGlobalSplitPass(PassRegistry&);
	void initializeGlobalsAAWrapperPassPass(PassRegistry&);
	void initializeGuardWideningLegacyPassPass(PassRegistry&);
	void initializeIPCPPass(PassRegistry&);
	void initializeIPSCCPLegacyPassPass(PassRegistry&);
	void initializeIRTranslatorPass(PassRegistry&);
	void initializeIVUsersWrapperPassPass(PassRegistry&);
	void initializeIfConverterPass(PassRegistry&);
	void initializeImplicitNullChecksPass(PassRegistry&);
	void initializeIndVarSimplifyLegacyPassPass(PassRegistry&);
	+void initializeIndirectBrExpandPassPass(PassRegistry&);
	void initializeInductiveRangeCheckEliminationPass(PassRegistry&);
	void initializeInferAddressSpacesPass(PassRegistry&);
	void initializeInferFunctionAttrsLegacyPassPass(PassRegistry&);
	void initializeInlineCostAnalysisPass(PassRegistry&);
	void initializeInstCountPass(PassRegistry&);
	void initializeInstNamerPass(PassRegistry&);
	void initializeInstSimplifierPass(PassRegistry&);
	void initializeInstrProfilingLegacyPassPass(PassRegistry&);
	void initializeInstructionCombiningPassPass(PassRegistry&);
	void initializeInstructionSelectPass(PassRegistry&);
	void initializeInterleavedAccessPass(PassRegistry&);
	void initializeInternalizeLegacyPassPass(PassRegistry&);
	void initializeIntervalPartitionPass(PassRegistry&);
	void initializeJumpThreadingPass(PassRegistry&);
	void initializeLCSSAVerificationPassPass(PassRegistry&);
	void initializeLCSSAWrapperPassPass(PassRegistry&);
	void initializeLazyBlockFrequencyInfoPassPass(PassRegistry&);
	void initializeLazyBranchProbabilityInfoPassPass(PassRegistry&);
	void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry&);
	void initializeLazyValueInfoPrinterPass(PassRegistry&);
	void initializeLazyValueInfoWrapperPassPass(PassRegistry&);
	void initializeLegacyLICMPassPass(PassRegistry&);
	void initializeLegacyLoopSinkPassPass(PassRegistry&);
	void initializeLegalizerPass(PassRegistry&);
	void initializeLibCallsShrinkWrapLegacyPassPass(PassRegistry&);
	void initializeLintPass(PassRegistry&);
	void initializeLiveDebugValuesPass(PassRegistry&);
	void initializeLiveDebugVariablesPass(PassRegistry&);
	void initializeLiveIntervalsPass(PassRegistry&);
	void initializeLiveRangeShrinkPass(PassRegistry&);
	void initializeLiveRegMatrixPass(PassRegistry&);
	void initializeLiveStacksPass(PassRegistry&);
	void initializeLiveVariablesPass(PassRegistry&);
	void initializeLoadStoreVectorizerPass(PassRegistry&);
	void initializeLoaderPassPass(PassRegistry&);
	void initializeLocalStackSlotPassPass(PassRegistry&);
	void initializeLocalizerPass(PassRegistry&);
	void initializeLoopAccessLegacyAnalysisPass(PassRegistry&);
	void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&);
	void initializeLoopDeletionLegacyPassPass(PassRegistry&);
	void initializeLoopDistributeLegacyPass(PassRegistry&);
	void initializeLoopExtractorPass(PassRegistry&);
	void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&);
	void initializeLoopInfoWrapperPassPass(PassRegistry&);
	void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&);
	void initializeLoopInterchangePass(PassRegistry&);
	void initializeLoopLoadEliminationPass(PassRegistry&);
	void initializeLoopPassPass(PassRegistry&);
	void initializeLoopPredicationLegacyPassPass(PassRegistry&);
	void initializeLoopRerollPass(PassRegistry&);
	void initializeLoopRotateLegacyPassPass(PassRegistry&);
	void initializeLoopSimplifyCFGLegacyPassPass(PassRegistry&);
	void initializeLoopSimplifyPass(PassRegistry&);
	void initializeLoopStrengthReducePass(PassRegistry&);
	void initializeLoopUnrollPass(PassRegistry&);
	void initializeLoopUnswitchPass(PassRegistry&);
	void initializeLoopVectorizePass(PassRegistry&);
	void initializeLoopVersioningLICMPass(PassRegistry&);
	void initializeLoopVersioningPassPass(PassRegistry&);
	void initializeLowerAtomicLegacyPassPass(PassRegistry&);
	void initializeLowerEmuTLSPass(PassRegistry&);
	void initializeLowerExpectIntrinsicPass(PassRegistry&);
	void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&);
	void initializeLowerIntrinsicsPass(PassRegistry&);
	void initializeLowerInvokeLegacyPassPass(PassRegistry&);
	void initializeLowerSwitchPass(PassRegistry&);
	void initializeLowerTypeTestsPass(PassRegistry&);
	void initializeMIRPrintingPassPass(PassRegistry&);
	void initializeMachineBlockFrequencyInfoPass(PassRegistry&);
	void initializeMachineBlockPlacementPass(PassRegistry&);
	void initializeMachineBlockPlacementStatsPass(PassRegistry&);
	void initializeMachineBranchProbabilityInfoPass(PassRegistry&);
	void initializeMachineCSEPass(PassRegistry&);
	void initializeMachineCombinerPass(PassRegistry&);
	void initializeMachineCopyPropagationPass(PassRegistry&);
	void initializeMachineDominanceFrontierPass(PassRegistry&);
	void initializeMachineDominatorTreePass(PassRegistry&);
	void initializeMachineFunctionPrinterPassPass(PassRegistry&);
	void initializeMachineLICMPass(PassRegistry&);
	void initializeMachineLoopInfoPass(PassRegistry&);
	void initializeMachineModuleInfoPass(PassRegistry&);
	void initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry&);
	void initializeMachineOutlinerPass(PassRegistry&);
	void initializeMachinePipelinerPass(PassRegistry&);
	void initializeMachinePostDominatorTreePass(PassRegistry&);
	void initializeMachineRegionInfoPassPass(PassRegistry&);
	void initializeMachineSchedulerPass(PassRegistry&);
	void initializeMachineSinkingPass(PassRegistry&);
	void initializeMachineTraceMetricsPass(PassRegistry&);
	void initializeMachineVerifierPassPass(PassRegistry&);
	void initializeMemCpyOptLegacyPassPass(PassRegistry&);
	void initializeMemDepPrinterPass(PassRegistry&);
	void initializeMemDerefPrinterPass(PassRegistry&);
	void initializeMemoryDependenceWrapperPassPass(PassRegistry&);
	void initializeMemorySSAPrinterLegacyPassPass(PassRegistry&);
	void initializeMemorySSAWrapperPassPass(PassRegistry&);
	void initializeMemorySanitizerPass(PassRegistry&);
	void initializeMergeFunctionsPass(PassRegistry&);
	void initializeMergeICmpsPass(PassRegistry&);
	void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
	void initializeMetaRenamerPass(PassRegistry&);
	void initializeModuleDebugInfoPrinterPass(PassRegistry&);
	void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&);
	void initializeNameAnonGlobalLegacyPassPass(PassRegistry&);
	void initializeNaryReassociateLegacyPassPass(PassRegistry&);
	void initializeNewGVNLegacyPassPass(PassRegistry&);
	void initializeObjCARCAAWrapperPassPass(PassRegistry&);
	void initializeObjCARCAPElimPass(PassRegistry&);
	void initializeObjCARCContractPass(PassRegistry&);
	void initializeObjCARCExpandPass(PassRegistry&);
	void initializeObjCARCOptPass(PassRegistry&);
	void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&);
	void initializeOptimizePHIsPass(PassRegistry&);
	void initializePAEvalPass(PassRegistry&);
	void initializePEIPass(PassRegistry&);
	void initializePGOIndirectCallPromotionLegacyPassPass(PassRegistry&);
	void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&);
	void initializePGOInstrumentationUseLegacyPassPass(PassRegistry&);
	void initializePGOMemOPSizeOptLegacyPassPass(PassRegistry&);
	void initializePHIEliminationPass(PassRegistry&);
	void initializePartialInlinerLegacyPassPass(PassRegistry&);
	void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry&);
	void initializePatchableFunctionPass(PassRegistry&);
	void initializePeepholeOptimizerPass(PassRegistry&);
	void initializePhysicalRegisterUsageInfoPass(PassRegistry&);
	void initializePlaceBackedgeSafepointsImplPass(PassRegistry&);
	void initializePlaceSafepointsPass(PassRegistry&);
	void initializePostDomOnlyPrinterPass(PassRegistry&);
	void initializePostDomOnlyViewerPass(PassRegistry&);
	void initializePostDomPrinterPass(PassRegistry&);
	void initializePostDomViewerPass(PassRegistry&);
	void initializePostDominatorTreeWrapperPassPass(PassRegistry&);
	void initializePostMachineSchedulerPass(PassRegistry&);
	void initializePostOrderFunctionAttrsLegacyPassPass(PassRegistry&);
	void initializePostRAHazardRecognizerPass(PassRegistry&);
	void initializePostRASchedulerPass(PassRegistry&);
	void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry&);
	void initializePredicateInfoPrinterLegacyPassPass(PassRegistry&);
	void initializePrintBasicBlockPassPass(PassRegistry&);
	void initializePrintFunctionPassWrapperPass(PassRegistry&);
	void initializePrintModulePassWrapperPass(PassRegistry&);
	void initializeProcessImplicitDefsPass(PassRegistry&);
	void initializeProfileSummaryInfoWrapperPassPass(PassRegistry&);
	void initializePromoteLegacyPassPass(PassRegistry&);
	void initializePruneEHPass(PassRegistry&);
	void initializeRABasicPass(PassRegistry&);
	void initializeRegAllocFastPass(PassRegistry&);
	void initializeRAGreedyPass(PassRegistry&);
	void initializeReassociateLegacyPassPass(PassRegistry&);
	void initializeRegBankSelectPass(PassRegistry&);
	void initializeRegToMemPass(PassRegistry&);
	void initializeRegionInfoPassPass(PassRegistry&);
	void initializeRegionOnlyPrinterPass(PassRegistry&);
	void initializeRegionOnlyViewerPass(PassRegistry&);
	void initializeRegionPrinterPass(PassRegistry&);
	void initializeRegionViewerPass(PassRegistry&);
	void initializeRegisterCoalescerPass(PassRegistry&);
	void initializeRenameIndependentSubregsPass(PassRegistry&);
	void initializeResetMachineFunctionPass(PassRegistry&);
	void initializeReversePostOrderFunctionAttrsLegacyPassPass(PassRegistry&);
	void initializeRewriteStatepointsForGCLegacyPassPass(PassRegistry &);
	void initializeRewriteSymbolsLegacyPassPass(PassRegistry&);
	void initializeSafepointIRVerifierPass(PassRegistry&);
	void initializeSCCPLegacyPassPass(PassRegistry&);
	void initializeSCEVAAWrapperPassPass(PassRegistry&);
	void initializeSLPVectorizerPass(PassRegistry&);
	void initializeSROALegacyPassPass(PassRegistry&);
	void initializeSafeStackLegacyPassPass(PassRegistry&);
	void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&);
	void initializeSanitizerCoverageModulePass(PassRegistry&);
	void initializeScalarEvolutionWrapperPassPass(PassRegistry&);
	void initializeScalarizeMaskedMemIntrinPass(PassRegistry&);
	void initializeScalarizerPass(PassRegistry&);
	void initializeScavengerTestPass(PassRegistry&);
	void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&);
	void initializeSeparateConstOffsetFromGEPPass(PassRegistry&);
	void initializeShadowStackGCLoweringPass(PassRegistry&);
	void initializeShrinkWrapPass(PassRegistry&);
	void initializeSimpleInlinerPass(PassRegistry&);
	void initializeSimpleLoopUnswitchLegacyPassPass(PassRegistry&);
	void initializeSingleLoopExtractorPass(PassRegistry&);
	void initializeSinkingLegacyPassPass(PassRegistry&);
	void initializeSjLjEHPreparePass(PassRegistry&);
	void initializeSlotIndexesPass(PassRegistry&);
	void initializeSpeculativeExecutionLegacyPassPass(PassRegistry&);
	void initializeSpillPlacementPass(PassRegistry&);
	void initializeStackColoringPass(PassRegistry&);
	void initializeStackMapLivenessPass(PassRegistry&);
	void initializeStackProtectorPass(PassRegistry&);
	void initializeStackSlotColoringPass(PassRegistry&);
	void initializeStraightLineStrengthReducePass(PassRegistry&);
	void initializeStripDeadDebugInfoPass(PassRegistry&);
	void initializeStripDeadPrototypesLegacyPassPass(PassRegistry&);
	void initializeStripDebugDeclarePass(PassRegistry&);
	void initializeStripGCRelocatesPass(PassRegistry&);
	void initializeStripNonDebugSymbolsPass(PassRegistry&);
	void initializeStripNonLineTableDebugInfoPass(PassRegistry&);
	void initializeStripSymbolsPass(PassRegistry&);
	void initializeStructurizeCFGPass(PassRegistry&);
	void initializeHWAddressSanitizerPass(PassRegistry&);
	void initializeTailCallElimPass(PassRegistry&);
	void initializeTailDuplicatePassPass(PassRegistry&);
	void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&);
	void initializeTargetPassConfigPass(PassRegistry&);
	void initializeTargetTransformInfoWrapperPassPass(PassRegistry&);
	void initializeThreadSanitizerPass(PassRegistry&);
	void initializeTwoAddressInstructionPassPass(PassRegistry&);
	void initializeTypeBasedAAWrapperPassPass(PassRegistry&);
	void initializeUnifyFunctionExitNodesPass(PassRegistry&);
	void initializeUnpackMachineBundlesPass(PassRegistry&);
	void initializeUnreachableBlockElimLegacyPassPass(PassRegistry&);
	void initializeUnreachableMachineBlockElimPass(PassRegistry&);
	void initializeVerifierLegacyPassPass(PassRegistry&);
	void initializeVirtRegMapPass(PassRegistry&);
	void initializeVirtRegRewriterPass(PassRegistry&);
	void initializeWholeProgramDevirtPass(PassRegistry&);
	void initializeWinEHPreparePass(PassRegistry&);
	void initializeWriteBitcodePassPass(PassRegistry&);
	void initializeWriteThinLTOBitcodePass(PassRegistry&);
	void initializeXRayInstrumentationPass(PassRegistry&);
	void initializeMIRCanonicalizerPass(PassRegistry &);

	} // end namespace llvm

	#endif // LLVM_INITIALIZEPASSES_H
	Index: head/contrib/llvm/lib/CodeGen/CodeGen.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/CodeGen.cpp (revision 328816)
	+++ head/contrib/llvm/lib/CodeGen/CodeGen.cpp (revision 328817)
	@@ -1,107 +1,108 @@
	//===-- CodeGen.cpp -------------------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the common initialization routines for the
	// CodeGen library.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm-c/Initialization.h"
	#include "llvm/InitializePasses.h"
	#include "llvm/PassRegistry.h"

	using namespace llvm;

	/// initializeCodeGen - Initialize all passes linked into the CodeGen library
	/// by calling each pass's initialize*Pass routine with \p Registry.
	void llvm::initializeCodeGen(PassRegistry &Registry) {
	initializeAtomicExpandPass(Registry);
	initializeBranchFolderPassPass(Registry);
	initializeBranchRelaxationPass(Registry);
	initializeCodeGenPreparePass(Registry);
	initializeDeadMachineInstructionElimPass(Registry);
	initializeDetectDeadLanesPass(Registry);
	initializeDwarfEHPreparePass(Registry);
	initializeEarlyIfConverterPass(Registry);
	initializeExpandISelPseudosPass(Registry);
	initializeExpandMemCmpPassPass(Registry);
	initializeExpandPostRAPass(Registry);
	initializeFEntryInserterPass(Registry);
	initializeFinalizeMachineBundlesPass(Registry);
	initializeFuncletLayoutPass(Registry);
	initializeGCMachineCodeAnalysisPass(Registry);
	initializeGCModuleInfoPass(Registry);
	initializeIfConverterPass(Registry);
	initializeImplicitNullChecksPass(Registry);
	+ initializeIndirectBrExpandPassPass(Registry);
	initializeInterleavedAccessPass(Registry);
	initializeLiveDebugValuesPass(Registry);
	initializeLiveDebugVariablesPass(Registry);
	initializeLiveIntervalsPass(Registry);
	initializeLiveRangeShrinkPass(Registry);
	initializeLiveStacksPass(Registry);
	initializeLiveVariablesPass(Registry);
	initializeLocalStackSlotPassPass(Registry);
	initializeLowerIntrinsicsPass(Registry);
	initializeMachineBlockFrequencyInfoPass(Registry);
	initializeMachineBlockPlacementPass(Registry);
	initializeMachineBlockPlacementStatsPass(Registry);
	initializeMachineCSEPass(Registry);
	initializeMachineCombinerPass(Registry);
	initializeMachineCopyPropagationPass(Registry);
	initializeMachineDominatorTreePass(Registry);
	initializeMachineFunctionPrinterPassPass(Registry);
	initializeMachineLICMPass(Registry);
	initializeMachineLoopInfoPass(Registry);
	initializeMachineModuleInfoPass(Registry);
	initializeMachineOptimizationRemarkEmitterPassPass(Registry);
	initializeMachineOutlinerPass(Registry);
	initializeMachinePipelinerPass(Registry);
	initializeMachinePostDominatorTreePass(Registry);
	initializeMachineRegionInfoPassPass(Registry);
	initializeMachineSchedulerPass(Registry);
	initializeMachineSinkingPass(Registry);
	initializeMachineVerifierPassPass(Registry);
	initializeOptimizePHIsPass(Registry);
	initializePEIPass(Registry);
	initializePHIEliminationPass(Registry);
	initializePatchableFunctionPass(Registry);
	initializePeepholeOptimizerPass(Registry);
	initializePostMachineSchedulerPass(Registry);
	initializePostRAHazardRecognizerPass(Registry);
	initializePostRASchedulerPass(Registry);
	initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
	initializeProcessImplicitDefsPass(Registry);
	initializeRABasicPass(Registry);
	initializeRegAllocFastPass(Registry);
	initializeRAGreedyPass(Registry);
	initializeRegisterCoalescerPass(Registry);
	initializeRenameIndependentSubregsPass(Registry);
	initializeSafeStackLegacyPassPass(Registry);
	initializeScalarizeMaskedMemIntrinPass(Registry);
	initializeShrinkWrapPass(Registry);
	initializeSlotIndexesPass(Registry);
	initializeStackColoringPass(Registry);
	initializeStackMapLivenessPass(Registry);
	initializeStackProtectorPass(Registry);
	initializeStackSlotColoringPass(Registry);
	initializeTailDuplicatePassPass(Registry);
	initializeTargetPassConfigPass(Registry);
	initializeTwoAddressInstructionPassPass(Registry);
	initializeUnpackMachineBundlesPass(Registry);
	initializeUnreachableBlockElimLegacyPassPass(Registry);
	initializeUnreachableMachineBlockElimPass(Registry);
	initializeVirtRegMapPass(Registry);
	initializeVirtRegRewriterPass(Registry);
	initializeWinEHPreparePass(Registry);
	initializeXRayInstrumentationPass(Registry);
	initializeMIRCanonicalizerPass(Registry);
	}

	/// C API entry point: unwraps the registry handle \p R and forwards it to
	/// initializeCodeGen.
	void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
	PassRegistry &Registry = *unwrap(R);
	initializeCodeGen(Registry);
	}
	Index: head/contrib/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/IndirectBrExpandPass.cpp (nonexistent)
	+++ head/contrib/llvm/lib/CodeGen/IndirectBrExpandPass.cpp (revision 328817)
	@@ -0,0 +1,221 @@
	+//===- IndirectBrExpandPass.cpp - Expand indirectbr to switch -------------===//
	+//
	+// The LLVM Compiler Infrastructure
	+//
	+// This file is distributed under the University of Illinois Open Source
	+// License. See LICENSE.TXT for details.
	+//
	+//===----------------------------------------------------------------------===//
	+/// \file
	+///
	+/// Implements an expansion pass to turn `indirectbr` instructions in the IR
	+/// into `switch` instructions. This works by enumerating the basic blocks in
	+/// a dense range of integers, replacing each `blockaddr` constant with the
	+/// corresponding integer constant, and then building a switch that maps from
	+/// the integers to the actual blocks. All of the indirectbr instructions in the
	+/// function are redirected to this common switch.
	+///
	+/// While this is generically useful if a target is unable to codegen
	+/// `indirectbr` natively, it is primarily useful when there is some desire to
	+/// get the builtin non-jump-table lowering of a switch even when the input
	+/// source contained an explicit indirect branch construct.
	+///
	+/// Note that it doesn't make any sense to enable this pass unless a target also
	+/// disables jump-table lowering of switches. Enabling it while jump tables are
	+/// still in use is likely to pessimize the code.
	+///
	+//===----------------------------------------------------------------------===//
	+
	+#include "llvm/ADT/STLExtras.h"
	+#include "llvm/ADT/Sequence.h"
	+#include "llvm/ADT/SmallVector.h"
	+#include "llvm/CodeGen/TargetPassConfig.h"
	+#include "llvm/CodeGen/TargetSubtargetInfo.h"
	+#include "llvm/IR/BasicBlock.h"
	+#include "llvm/IR/Function.h"
	+#include "llvm/IR/IRBuilder.h"
	+#include "llvm/IR/InstIterator.h"
	+#include "llvm/IR/Instruction.h"
	+#include "llvm/IR/Instructions.h"
	+#include "llvm/Pass.h"
	+#include "llvm/Support/Debug.h"
	+#include "llvm/Support/ErrorHandling.h"
	+#include "llvm/Support/raw_ostream.h"
	+#include "llvm/Target/TargetMachine.h"
	+
	+using namespace llvm;
	+
	+#define DEBUG_TYPE "indirectbr-expand"
	+
	+namespace {
	+
	+/// FunctionPass that turns `indirectbr` instructions into `switch`
	+/// instructions; the transform itself is implemented in runOnFunction.
	+class IndirectBrExpandPass : public FunctionPass {
	+ /// Target lowering for the function being processed. Null until
	+ /// runOnFunction assigns it from the subtarget's getTargetLowering().
	+ const TargetLowering *TLI = nullptr;
	+
	+public:
	+ static char ID; // Pass identification, replacement for typeid
	+
	+ /// Constructs the pass and calls its initialize routine on the global
	+ /// PassRegistry.
	+ IndirectBrExpandPass() : FunctionPass(ID) {
	+ initializeIndirectBrExpandPassPass(*PassRegistry::getPassRegistry());
	+ }
	+
	+ /// Expands the indirectbr instructions of \p F (defined out of line).
	+ bool runOnFunction(Function &F) override;
	+};
	+
	+} // end anonymous namespace
	+
	+// Out-of-line definition of the static pass ID declared in the class.
	+char IndirectBrExpandPass::ID = 0;
	+
	+// Pass initialization boilerplate: the pass is named by DEBUG_TYPE
	+// ("indirectbr-expand") and described by the string below.
	+INITIALIZE_PASS(IndirectBrExpandPass, DEBUG_TYPE,
	+ "Expand indirectbr instructions", false, false)
	+
	+/// Factory for the indirectbr expansion pass: returns a newly allocated
	+/// IndirectBrExpandPass as a FunctionPass pointer.
	+FunctionPass *llvm::createIndirectBrExpandPass() {
	+ FunctionPass *NewPass = new IndirectBrExpandPass();
	+ return NewPass;
	+}
	+
	+bool IndirectBrExpandPass::runOnFunction(Function &F) {
	+ auto &DL = F.getParent()->getDataLayout();
	+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
	+ if (!TPC)
	+ return false;
	+
	+ auto &TM = TPC->getTM<TargetMachine>();
	+ auto &STI = *TM.getSubtargetImpl(F);
	+ if (!STI.enableIndirectBrExpand())
	+ return false;
	+ TLI = STI.getTargetLowering();
	+
	+ SmallVector<IndirectBrInst *, 1> IndirectBrs;
	+
	+ // Set of all potential successors for indirectbr instructions.
	+ SmallPtrSet<BasicBlock *, 4> IndirectBrSuccs;
	+
	+ // Build a list of indirectbrs that we want to rewrite.
	+ for (BasicBlock &BB : F)
	+ if (auto *IBr = dyn_cast<IndirectBrInst>(BB.getTerminator())) {
	+ // Handle the degenerate case of no successors by replacing the indirectbr
	+ // with unreachable as there is no successor available.
	+ if (IBr->getNumSuccessors() == 0) {
	+ (void)new UnreachableInst(F.getContext(), IBr);
	+ IBr->eraseFromParent();
	+ continue;
	+ }
	+
	+ IndirectBrs.push_back(IBr);
	+ for (BasicBlock *SuccBB : IBr->successors())
	+ IndirectBrSuccs.insert(SuccBB);
	+ }
	+
	+ if (IndirectBrs.empty())
	+ return false;
	+
	+ // If we need to replace any indirectbrs we need to establish integer
	+ // constants that will correspond to each of the basic blocks in the function
	+ // whose address escapes. We do that here and rewrite all the blockaddress
	+ // constants to just be those integer constants cast to a pointer type.
	+ SmallVector<BasicBlock *, 4> BBs;
	+
	+ for (BasicBlock &BB : F) {
	+ // Skip blocks that aren't successors to an indirectbr we're going to
	+ // rewrite.
	+ if (!IndirectBrSuccs.count(&BB))
	+ continue;
	+
	+ auto IsBlockAddressUse = [&](const Use &U) {
	+ return isa<BlockAddress>(U.getUser());
	+ };
	+ auto BlockAddressUseIt = llvm::find_if(BB.uses(), IsBlockAddressUse);
	+ if (BlockAddressUseIt == BB.use_end())
	+ continue;
	+
	+ assert(std::find_if(std::next(BlockAddressUseIt), BB.use_end(),
	+ IsBlockAddressUse) == BB.use_end() &&
	+ "There should only ever be a single blockaddress use because it is "
	+ "a constant and should be uniqued.");
	+
	+ auto *BA = cast<BlockAddress>(BlockAddressUseIt->getUser());
	+
	+ // Skip if the constant was formed but ended up not being used (due to DCE
	+ // or whatever).
	+ if (!BA->isConstantUsed())
	+ continue;
	+
	+ // Compute the index we want to use for this basic block. We can't use zero
	+ // because null can be compared with block addresses.
	+ int BBIndex = BBs.size() + 1;
	+ BBs.push_back(&BB);
	+
	+ auto *ITy = cast<IntegerType>(DL.getIntPtrType(BA->getType()));
	+ ConstantInt *BBIndexC = ConstantInt::get(ITy, BBIndex);
	+
	+ // Now rewrite the blockaddress to an integer constant based on the index.
	+ // FIXME: We could potentially preserve the uses as arguments to inline asm.
	+ // This would allow some uses such as diagnostic information in crashes to
	+ // have higher quality even when this transform is enabled, but would break
	+ // users that round-trip blockaddresses through inline assembly and then
	+ // back into an indirectbr.
	+ BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(BBIndexC, BA->getType()));
	+ }
	+
	+ if (BBs.empty()) {
	+ // There are no blocks whose address is taken, so any indirectbr instruction
	+ // cannot get a valid input and we can replace all of them with unreachable.
	+ for (auto *IBr : IndirectBrs) {
	+ (void)new UnreachableInst(F.getContext(), IBr);
	+ IBr->eraseFromParent();
	+ }
	+ return true;
	+ }
	+
	+ BasicBlock *SwitchBB;
	+ Value *SwitchValue;
	+
	+ // Compute a common integer type across all the indirectbr instructions.
	+ IntegerType *CommonITy = nullptr;
	+ for (auto *IBr : IndirectBrs) {
	+ auto *ITy =
	+ cast<IntegerType>(DL.getIntPtrType(IBr->getAddress()->getType()));
	+ if (!CommonITy \|\| ITy->getBitWidth() > CommonITy->getBitWidth())
	+ CommonITy = ITy;
	+ }
	+
	+ auto GetSwitchValue = [DL, CommonITy](IndirectBrInst *IBr) {
	+ return CastInst::CreatePointerCast(
	+ IBr->getAddress(), CommonITy,
	+ Twine(IBr->getAddress()->getName()) + ".switch_cast", IBr);
	+ };
	+
	+ if (IndirectBrs.size() == 1) {
	+ // If we only have one indirectbr, we can just directly replace it within
	+ // its block.
	+ SwitchBB = IndirectBrs[0]->getParent();
	+ SwitchValue = GetSwitchValue(IndirectBrs[0]);
	+ IndirectBrs[0]->eraseFromParent();
	+ } else {
	+ // Otherwise we need to create a new block to hold the switch across BBs,
	+ // jump to that block instead of each indirectbr, and phi together the
	+ // values for the switch.
	+ SwitchBB = BasicBlock::Create(F.getContext(), "switch_bb", &F);
	+ auto *SwitchPN = PHINode::Create(CommonITy, IndirectBrs.size(),
	+ "switch_value_phi", SwitchBB);
	+ SwitchValue = SwitchPN;
	+
	+ // Now replace the indirectbr instructions with direct branches to the
	+ // switch block and fill out the PHI operands.
	+ for (auto *IBr : IndirectBrs) {
	+ SwitchPN->addIncoming(GetSwitchValue(IBr), IBr->getParent());
	+ BranchInst::Create(SwitchBB, IBr);
	+ IBr->eraseFromParent();
	+ }
	+ }
	+
	+ // Now build the switch in the block. The block will have no terminator
	+ // already.
	+ auto *SI = SwitchInst::Create(SwitchValue, BBs[0], BBs.size(), SwitchBB);
	+
	+ // Add a case for each block.
	+ for (int i : llvm::seq<int>(1, BBs.size()))
	+ SI->addCase(ConstantInt::get(CommonITy, i + 1), BBs[i]);
	+
	+ return true;
	+}

	Property changes on: head/contrib/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: head/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (revision 328816)
	+++ head/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (revision 328817)
	@@ -1,4708 +1,4710 @@
	//===- LegalizeDAG.cpp - Implement SelectionDAG::Legalize -----------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the SelectionDAG::Legalize method.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetFrameLowering.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Type.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <tuple>
	#include <utility>

	using namespace llvm;

	#define DEBUG_TYPE "legalizedag"

	namespace {

	/// Keeps track of state when getting the sign of a floating-point value as an
	/// integer.
	///
	/// An instance is filled in through the non-const reference taken by
	/// SelectionDAGLegalize::getSignAsIntValue() and read back through the const
	/// reference taken by SelectionDAGLegalize::modifySignAsInt().
	/// NOTE(review): the per-field notes below are inferred from the field names
	/// and types only -- confirm against those two functions.
	struct FloatSignAsInt {
	// presumably the type of the original floating-point value -- TODO confirm
	EVT FloatVT;
	// presumably the chain of any memory operation used to obtain the integer
	// view -- TODO confirm
	SDValue Chain;
	// presumably addresses (and matching pointer infos) used when the value is
	// reinterpreted through memory -- TODO confirm
	SDValue FloatPtr;
	SDValue IntPtr;
	MachinePointerInfo IntPointerInfo;
	MachinePointerInfo FloatPointerInfo;
	// presumably the integer value that carries the sign bit -- TODO confirm
	SDValue IntValue;
	// presumably the mask selecting the sign bit inside IntValue, and that bit's
	// position -- TODO confirm
	APInt SignMask;
	uint8_t SignBit;
	};

	//===----------------------------------------------------------------------===//
	/// This takes an arbitrary SelectionDAG as input and
	/// hacks on it until the target machine can handle it. This involves
	/// eliminating value sizes the machine cannot handle (promoting small sizes to
	/// large sizes or splitting up large values into small values) as well as
	/// eliminating operations the machine cannot handle.
	///
	/// This code also does a small amount of optimization and recognition of idioms
	/// as part of its processing. For example, if a target does not support a
	/// 'setcc' instruction efficiently, but does support 'brcc' instruction, this
	/// will attempt merge setcc and brc instructions into brcc's.
	class SelectionDAGLegalize {
	const TargetMachine &TM;
	const TargetLowering &TLI;
	SelectionDAG &DAG;

	/// \brief The set of nodes which have already been legalized. We hold a
	/// reference to it in order to update as necessary on node deletion.
	SmallPtrSetImpl<SDNode *> &LegalizedNodes;

	/// \brief A set of all the nodes updated during legalization.
	SmallSetVector<SDNode , 16> UpdatedNodes;

	EVT getSetCCResultType(EVT VT) const {
	return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	}

	// Libcall insertion helpers.

	public:
	SelectionDAGLegalize(SelectionDAG &DAG,
	SmallPtrSetImpl<SDNode *> &LegalizedNodes,
	SmallSetVector<SDNode , 16> UpdatedNodes = nullptr)
	: TM(DAG.getTarget()), TLI(DAG.getTargetLoweringInfo()), DAG(DAG),
	LegalizedNodes(LegalizedNodes), UpdatedNodes(UpdatedNodes) {}

	/// \brief Legalizes the given operation.
	void LegalizeOp(SDNode *Node);

	private:
	SDValue OptimizeFloatStore(StoreSDNode *ST);

	void LegalizeLoadOps(SDNode *Node);
	void LegalizeStoreOps(SDNode *Node);

	/// Some targets cannot handle a variable
	/// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it
	/// is necessary to spill the vector being inserted into to memory, perform
	/// the insert there, and then read the result back.
	SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
	const SDLoc &dl);
	SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx,
	const SDLoc &dl);

	/// Return a vector shuffle operation which
	/// performs the same shuffe in terms of order or result bytes, but on a type
	/// whose vector element type is narrower than the original shuffle type.
	/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
	SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, const SDLoc &dl,
	SDValue N1, SDValue N2,
	ArrayRef<int> Mask) const;

	bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
	bool &NeedInvert, const SDLoc &dl);

	SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
	SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
	unsigned NumOps, bool isSigned, const SDLoc &dl);

	std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
	SDNode *Node, bool isSigned);
	SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
	RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80,
	RTLIB::Libcall Call_F128,
	RTLIB::Libcall Call_PPCF128);
	SDValue ExpandIntLibCall(SDNode *Node, bool isSigned,
	RTLIB::Libcall Call_I8,
	RTLIB::Libcall Call_I16,
	RTLIB::Libcall Call_I32,
	RTLIB::Libcall Call_I64,
	RTLIB::Libcall Call_I128);
	void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
	void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);

	SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
	const SDLoc &dl);
	SDValue ExpandBUILD_VECTOR(SDNode *Node);
	SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
	void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
	SmallVectorImpl<SDValue> &Results);
	void getSignAsIntValue(FloatSignAsInt &State, const SDLoc &DL,
	SDValue Value) const;
	SDValue modifySignAsInt(const FloatSignAsInt &State, const SDLoc &DL,
	SDValue NewIntValue) const;
	SDValue ExpandFCOPYSIGN(SDNode *Node) const;
	SDValue ExpandFABS(SDNode *Node) const;
	SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT,
	const SDLoc &dl);
	SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned,
	const SDLoc &dl);
	SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned,
	const SDLoc &dl);

	SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
	SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
	SDValue ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl);

	SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
	SDValue ExpandInsertToVectorThroughStack(SDValue Op);
	SDValue ExpandVectorBuildThroughStack(SDNode* Node);

	SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
	SDValue ExpandConstant(ConstantSDNode *CP);

	// if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall
	bool ExpandNode(SDNode *Node);
	void ConvertNodeToLibcall(SDNode *Node);
	void PromoteNode(SDNode *Node);

	public:
	// Node replacement helpers

	void ReplacedNode(SDNode *N) {
	LegalizedNodes.erase(N);
	if (UpdatedNodes)
	UpdatedNodes->insert(N);
	}

	void ReplaceNode(SDNode Old, SDNode New) {
	DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
	dbgs() << " with: "; New->dump(&DAG));

	assert(Old->getNumValues() == New->getNumValues() &&
	"Replacing one node with another that produces a different number "
	"of values!");
	DAG.ReplaceAllUsesWith(Old, New);
	if (UpdatedNodes)
	UpdatedNodes->insert(New);
	ReplacedNode(Old);
	}

	void ReplaceNode(SDValue Old, SDValue New) {
	DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
	dbgs() << " with: "; New->dump(&DAG));

	DAG.ReplaceAllUsesWith(Old, New);
	if (UpdatedNodes)
	UpdatedNodes->insert(New.getNode());
	ReplacedNode(Old.getNode());
	}

	void ReplaceNode(SDNode Old, const SDValue New) {
	DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG));

	DAG.ReplaceAllUsesWith(Old, New);
	for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
	DEBUG(dbgs() << (i == 0 ? " with: "
	: " and: ");
	New[i]->dump(&DAG));
	if (UpdatedNodes)
	UpdatedNodes->insert(New[i].getNode());
	}
	ReplacedNode(Old);
	}
	};

	} // end anonymous namespace

	/// Re-express a shuffle of VT-typed vectors as a shuffle on NVT, a vector
	/// type with narrower elements, so that the same result bytes are selected in
	/// the same order. Every original mask entry is widened into
	/// (NVT element count / VT element count) consecutive entries; negative
	/// entries are emitted as -1.
	/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
	SDValue SelectionDAGLegalize::ShuffleWithNarrowerEltType(
	    EVT NVT, EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
	    ArrayRef<int> Mask) const {
	  const unsigned SrcElts = VT.getVectorNumElements();
	  const unsigned DstElts = NVT.getVectorNumElements();
	  const unsigned Scale = DstElts / SrcElts;

	  assert(Scale && "Cannot promote to vector type with fewer elts!");

	  // Same element count: the incoming mask can be used unchanged.
	  if (Scale == 1)
	    return DAG.getVectorShuffle(NVT, dl, N1, N2, Mask);

	  SmallVector<int, 8> NewMask;
	  for (unsigned i = 0; i != SrcElts; ++i) {
	    const int Idx = Mask[i];
	    for (unsigned j = 0; j != Scale; ++j) {
	      if (Idx >= 0)
	        NewMask.push_back(Idx * Scale + j);
	      else
	        NewMask.push_back(-1);
	    }
	  }
	  assert(NewMask.size() == DstElts && "Non-integer NumEltsGrowth?");
	  assert(TLI.isShuffleMaskLegal(NewMask, NVT) && "Shuffle not legal?");
	  return DAG.getVectorShuffle(NVT, dl, N1, N2, NewMask);
	}

	/// Expands the ConstantFP node to an integer constant or
	/// a load from the constant pool.
	///
	/// \param CFP   the floating-point constant node to expand.
	/// \param UseCP when false, the constant's bit pattern is returned as an
	///              i32/i64 integer constant (only f32 and f64 are accepted);
	///              when true, the constant is placed in the constant pool and
	///              a load (possibly an extending load) of it is returned.
	SDValue
	SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
	  bool Extend = false;
	  SDLoc dl(CFP);

	  // If a FP immediate is precise when represented as a float and if the
	  // target can do an extending load from float to double, we put it into
	  // the constant pool as a float, even if it is statically typed as a
	  // double. This shrinks FP constants and canonicalizes them for targets where
	  // an FP extending load is the same cost as a normal load (such as on the x87
	  // fp stack or PPC FP unit).
	  EVT VT = CFP->getValueType(0);
	  ConstantFP *LLVMC = const_cast<ConstantFP *>(CFP->getConstantFPValue());
	  if (!UseCP) {
	    assert((VT == MVT::f64 || VT == MVT::f32) && "Invalid type expansion");
	    return DAG.getConstant(LLVMC->getValueAPF().bitcastToAPInt(), dl,
	                           (VT == MVT::f64) ? MVT::i64 : MVT::i32);
	  }

	  APFloat APF = CFP->getValueAPF();
	  EVT OrigVT = VT;
	  EVT SVT = VT;

	  // We don't want to shrink SNaNs. Converting the SNaN back to its real type
	  // can cause it to be changed into a QNaN on some platforms (e.g. on SystemZ).
	  if (!APF.isSignaling()) {
	    // Step SVT down one simple value type per iteration until f32 or f16 is
	    // reached; each type that passes the checks becomes the new storage type.
	    while (SVT != MVT::f32 && SVT != MVT::f16) {
	      SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
	      if (ConstantFPSDNode::isValueValidForType(SVT, APF) &&
	          // Only do this if the target has a native EXTLOAD instruction from
	          // smaller type.
	          TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) &&
	          TLI.ShouldShrinkFPConstant(OrigVT)) {
	        Type *SType = SVT.getTypeForEVT(*DAG.getContext());
	        LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
	        VT = SVT;
	        Extend = true;
	      }
	    }
	  }

	  SDValue CPIdx =
	      DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout()));
	  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
	  if (Extend) {
	    // The pool entry was shrunk to VT: load it back extended to OrigVT.
	    SDValue Result = DAG.getExtLoad(
	        ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), VT,
	        Alignment);
	    return Result;
	  }
	  SDValue Result = DAG.getLoad(
	      OrigVT, dl, DAG.getEntryNode(), CPIdx,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
	  return Result;
	}

	/// Materialize an integer constant node by placing its value in the constant
	/// pool and returning a load of that pool entry.
	SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) {
	  const SDLoc Loc(CP);
	  const EVT ResultVT = CP->getValueType(0);

	  // Create the pool entry, addressed with the target's pointer type.
	  const auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	  SDValue PoolAddr = DAG.getConstantPool(CP->getConstantIntValue(), PtrVT);
	  const unsigned PoolAlign =
	      cast<ConstantPoolSDNode>(PoolAddr)->getAlignment();

	  // Load the value back from the pool with the entry's own alignment.
	  return DAG.getLoad(
	      ResultVT, Loc, DAG.getEntryNode(), PoolAddr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	      PoolAlign);
	}

	/// Some targets cannot handle a variable insertion index for the
	/// INSERT_VECTOR_ELT instruction. In that case the vector is spilled to a
	/// stack temporary, the element is overwritten in memory, and the whole
	/// vector is loaded back.
	SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec,
	                                                             SDValue Val,
	                                                             SDValue Idx,
	                                                             const SDLoc &dl) {
	  // Going through memory is badness. We could also load the value into a
	  // vector register (either with a "move to register" or "extload into
	  // register" instruction, then permute it into place, if the idx is a
	  // constant and if the idx is supported by the target.
	  const EVT VecVT = Vec.getValueType();
	  const EVT EltVT = VecVT.getVectorElementType();
	  SDValue Slot = DAG.CreateStackTemporary(VecVT);
	  const int SlotFI = cast<FrameIndexSDNode>(Slot.getNode())->getIndex();

	  // Spill the whole vector to the stack slot.
	  SDValue StoreChain = DAG.getStore(
	      DAG.getEntryNode(), dl, Vec, Slot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotFI));

	  // Address of the element selected by Idx inside the slot.
	  SDValue EltAddr = TLI.getVectorElementPointer(DAG, Slot, VecVT, Idx);

	  // Overwrite that element with the scalar, truncating to the element type.
	  StoreChain = DAG.getTruncStore(StoreChain, dl, Val, EltAddr,
	                                 MachinePointerInfo(), EltVT);

	  // Reload the updated vector.
	  return DAG.getLoad(
	      VecVT, dl, StoreChain, Slot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotFI));
	}

	/// Expand an INSERT_VECTOR_ELT. With a constant index and a compatible scalar
	/// type the insert becomes SCALAR_TO_VECTOR plus a vector shuffle; in every
	/// other case it falls back to PerformInsertVectorEltInMemory().
	SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
	                                                      SDValue Idx,
	                                                      const SDLoc &dl) {
	  auto *InsertPos = dyn_cast<ConstantSDNode>(Idx);
	  if (!InsertPos)
	    return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);

	  // SCALAR_TO_VECTOR requires that the type of the value being inserted
	  // match the element type of the vector being created, except for
	  // integers in which case the inserted value can be over width.
	  const EVT VecVT = Vec.getValueType();
	  const EVT EltVT = VecVT.getVectorElementType();
	  const EVT ValVT = Val.getValueType();
	  if (ValVT != EltVT && !(EltVT.isInteger() && ValVT.bitsGE(EltVT)))
	    return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);

	  SDValue ScVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Val);

	  // We generate a shuffle of InVec and ScVec, so the shuffle mask
	  // should be 0,1,2,3,4,5... with the appropriate element replaced with
	  // elt 0 of the RHS (mask index NumElts).
	  const unsigned NumElts = VecVT.getVectorNumElements();
	  const uint64_t InsertIdx = InsertPos->getZExtValue();
	  SmallVector<int, 8> ShufOps;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    if (i == InsertIdx)
	      ShufOps.push_back(NumElts);
	    else
	      ShufOps.push_back(i);
	  }

	  return DAG.getVectorShuffle(VecVT, dl, Vec, ScVec, ShufOps);
	}

	/// Try to rewrite a store of a floating-point constant as a store of the
	/// equivalent integer bit pattern. Returns the replacement store (or a
	/// TokenFactor of two stores), or a null SDValue when nothing was done.
	SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
	  DEBUG(dbgs() << "Optimizing float store operations\n");
	  // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
	  // FIXME: We shouldn't do this for TargetConstantFP's.
	  // FIXME: move this to the DAG Combiner! Note that we can't regress due
	  // to phase ordering between legalized code and the dag combiner. This
	  // probably means that we need to integrate dag combiner and legalizer
	  // together.
	  // We generally can't do this one for long doubles.
	  auto *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue());
	  if (!CFP)
	    return SDValue(nullptr, 0);

	  SDValue Chain = ST->getChain();
	  SDValue Ptr = ST->getBasePtr();
	  const unsigned Alignment = ST->getAlignment();
	  const MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
	  const AAMDNodes AAInfo = ST->getAAInfo();
	  SDLoc dl(ST);

	  const EVT CstVT = CFP->getValueType(0);

	  // f32 constant with a legal i32: one 32-bit integer store.
	  if (CstVT == MVT::f32 && TLI.isTypeLegal(MVT::i32)) {
	    SDValue Bits =
	        DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(32),
	                        SDLoc(CFP), MVT::i32);
	    return DAG.getStore(Chain, dl, Bits, Ptr, ST->getPointerInfo(), Alignment,
	                        MMOFlags, AAInfo);
	  }

	  if (CstVT != MVT::f64)
	    return SDValue(nullptr, 0);

	  // If this target supports 64-bit registers, do a single 64-bit store.
	  if (TLI.isTypeLegal(MVT::i64)) {
	    SDValue Bits =
	        DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(64),
	                        SDLoc(CFP), MVT::i64);
	    return DAG.getStore(Chain, dl, Bits, Ptr, ST->getPointerInfo(), Alignment,
	                        MMOFlags, AAInfo);
	  }

	  // Otherwise, if the target supports 32-bit registers, use 2 32-bit
	  // stores. If the target supports neither 32- nor 64-bits, this
	  // xform is certainly not worth it. Volatile stores are never split.
	  if (!TLI.isTypeLegal(MVT::i32) || ST->isVolatile())
	    return SDValue(nullptr, 0);

	  const APInt IntVal = CFP->getValueAPF().bitcastToAPInt();
	  SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32);
	  SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32);
	  if (DAG.getDataLayout().isBigEndian())
	    std::swap(Lo, Hi);

	  // First half at the original address, second half four bytes further on.
	  Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), Alignment,
	                    MMOFlags, AAInfo);
	  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	                    DAG.getConstant(4, dl, Ptr.getValueType()));
	  Hi = DAG.getStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(4),
	                    MinAlign(Alignment, 4U), MMOFlags, AAInfo);

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
	}

	/// Legalize a store node, replacing it in the DAG when a rewrite is needed.
	///
	/// Non-truncating stores first go through OptimizeFloatStore(); otherwise
	/// they are handled according to TLI.getOperationAction(ISD::STORE, VT)
	/// (Legal, Custom or Promote).
	///
	/// Truncating stores are handled in three steps, in this order:
	///  - memory type not a whole number of bytes: zero-extend in register and
	///    store at the byte-rounded width;
	///  - width not a power of two: split into two stores joined by a
	///    TokenFactor;
	///  - otherwise: act on TLI.getTruncStoreAction() (Legal, Custom or Expand).
	///
	/// \param Node the store to legalize; it is cast to StoreSDNode.
	void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
	StoreSDNode *ST = cast<StoreSDNode>(Node);
	SDValue Chain = ST->getChain();
	SDValue Ptr = ST->getBasePtr();
	SDLoc dl(Node);

	unsigned Alignment = ST->getAlignment();
	MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
	AAMDNodes AAInfo = ST->getAAInfo();

	if (!ST->isTruncatingStore()) {
	DEBUG(dbgs() << "Legalizing store operation\n");
	// A store of an FP constant may be replaced by an integer store; if
	// OptimizeFloatStore produced a node, use it and stop.
	if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
	ReplaceNode(ST, OptStore);
	return;
	}

	SDValue Value = ST->getValue();
	MVT VT = Value.getSimpleValueType();
	// Only Legal, Custom and Promote are handled here; any other action reaches
	// the llvm_unreachable default.
	switch (TLI.getOperationAction(ISD::STORE, VT)) {
	default: llvm_unreachable("This action is not supported yet!");
	case TargetLowering::Legal: {
	// If this is an unaligned store and the target doesn't support it,
	// expand it.
	EVT MemVT = ST->getMemoryVT();
	unsigned AS = ST->getAddressSpace();
	unsigned Align = ST->getAlignment();
	const DataLayout &DL = DAG.getDataLayout();
	if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
	DEBUG(dbgs() << "Expanding unsupported unaligned store\n");
	SDValue Result = TLI.expandUnalignedStore(ST, DAG);
	ReplaceNode(SDValue(ST, 0), Result);
	} else
	DEBUG(dbgs() << "Legal store\n");
	break;
	}
	case TargetLowering::Custom: {
	DEBUG(dbgs() << "Trying custom lowering\n");
	SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
	// Replace only when the target returned a non-null value that differs from
	// the original node.
	if (Res && Res != SDValue(Node, 0))
	ReplaceNode(SDValue(Node, 0), Res);
	return;
	}
	case TargetLowering::Promote: {
	// Promotion only reinterprets the bits: bitcast the value to the same-size
	// type NVT and store that instead.
	MVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
	assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
	"Can only promote stores to same size type");
	Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
	SDValue Result =
	DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
	Alignment, MMOFlags, AAInfo);
	ReplaceNode(SDValue(Node, 0), Result);
	break;
	}
	}
	return;
	}

	// Everything below handles truncating stores only.
	DEBUG(dbgs() << "Legalizing truncating store operations\n");
	SDValue Value = ST->getValue();
	EVT StVT = ST->getMemoryVT();
	unsigned StWidth = StVT.getSizeInBits();
	auto &DL = DAG.getDataLayout();

	if (StWidth != StVT.getStoreSizeInBits()) {
	// Promote to a byte-sized store with upper bits zero if not
	// storing an integral number of bytes. For example, promote
	// TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
	EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
	StVT.getStoreSizeInBits());
	Value = DAG.getZeroExtendInReg(Value, dl, StVT);
	SDValue Result =
	DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), NVT,
	Alignment, MMOFlags, AAInfo);
	ReplaceNode(SDValue(Node, 0), Result);
	} else if (StWidth & (StWidth - 1)) {
	// If not storing a power-of-2 number of bits, expand as two stores.
	assert(!StVT.isVector() && "Unsupported truncstore!");
	// RoundWidth is the largest power of two below StWidth; ExtraWidth is the
	// remainder. Both must be whole bytes (asserted below).
	unsigned RoundWidth = 1 << Log2_32(StWidth);
	assert(RoundWidth < StWidth);
	unsigned ExtraWidth = StWidth - RoundWidth;
	assert(ExtraWidth < RoundWidth);
	assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
	"Store size not an integral number of bytes!");
	EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
	EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
	SDValue Lo, Hi;
	unsigned IncrementSize;

	if (DL.isLittleEndian()) {
	// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
	// Store the bottom RoundWidth bits.
	Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
	RoundVT, Alignment, MMOFlags, AAInfo);

	// Store the remaining ExtraWidth bits.
	IncrementSize = RoundWidth / 8;
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	DAG.getConstant(IncrementSize, dl,
	Ptr.getValueType()));
	Hi = DAG.getNode(
	ISD::SRL, dl, Value.getValueType(), Value,
	DAG.getConstant(RoundWidth, dl,
	TLI.getShiftAmountTy(Value.getValueType(), DL)));
	Hi = DAG.getTruncStore(
	Chain, dl, Hi, Ptr,
	ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
	MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
	} else {
	// Big endian - avoid unaligned stores.
	// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
	// Store the top RoundWidth bits.
	Hi = DAG.getNode(
	ISD::SRL, dl, Value.getValueType(), Value,
	DAG.getConstant(ExtraWidth, dl,
	TLI.getShiftAmountTy(Value.getValueType(), DL)));
	Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
	RoundVT, Alignment, MMOFlags, AAInfo);

	// Store the remaining ExtraWidth bits.
	IncrementSize = RoundWidth / 8;
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	DAG.getConstant(IncrementSize, dl,
	Ptr.getValueType()));
	Lo = DAG.getTruncStore(
	Chain, dl, Value, Ptr,
	ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
	MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
	}

	// The order of the stores doesn't matter.
	SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
	ReplaceNode(SDValue(Node, 0), Result);
	} else {
	// Byte-sized, power-of-two width: defer to the target's truncating-store
	// action for this (value type, memory type) pair.
	switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) {
	default: llvm_unreachable("This action is not supported yet!");
	case TargetLowering::Legal: {
	EVT MemVT = ST->getMemoryVT();
	unsigned AS = ST->getAddressSpace();
	unsigned Align = ST->getAlignment();
	// If this is an unaligned store and the target doesn't support it,
	// expand it.
	if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
	SDValue Result = TLI.expandUnalignedStore(ST, DAG);
	ReplaceNode(SDValue(ST, 0), Result);
	}
	break;
	}
	case TargetLowering::Custom: {
	SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
	// Replace only when the target returned a non-null value that differs from
	// the original node.
	if (Res && Res != SDValue(Node, 0))
	ReplaceNode(SDValue(Node, 0), Res);
	return;
	}
	case TargetLowering::Expand:
	assert(!StVT.isVector() &&
	"Vector Stores are handled in LegalizeVectorOps");

	SDValue Result;

	// TRUNCSTORE:i16 i32 -> STORE i16
	if (TLI.isTypeLegal(StVT)) {
	Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
	Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
	Alignment, MMOFlags, AAInfo);
	} else {
	// The in-memory type isn't legal. Truncate to the type it would promote
	// to, and then do a truncstore.
	Value = DAG.getNode(ISD::TRUNCATE, dl,
	TLI.getTypeToTransformTo(*DAG.getContext(), StVT),
	Value);
	Result = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
	StVT, Alignment, MMOFlags, AAInfo);
	}

	ReplaceNode(SDValue(Node, 0), Result);
	break;
	}
	}
	}

	void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
	LoadSDNode *LD = cast<LoadSDNode>(Node);
	SDValue Chain = LD->getChain(); // The chain.
	SDValue Ptr = LD->getBasePtr(); // The base pointer.
	SDValue Value; // The value returned by the load op.
	SDLoc dl(Node);

	ISD::LoadExtType ExtType = LD->getExtensionType();
	if (ExtType == ISD::NON_EXTLOAD) {
	DEBUG(dbgs() << "Legalizing non-extending load operation\n");
	MVT VT = Node->getSimpleValueType(0);
	SDValue RVal = SDValue(Node, 0);
	SDValue RChain = SDValue(Node, 1);

	switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
	default: llvm_unreachable("This action is not supported yet!");
	case TargetLowering::Legal: {
	EVT MemVT = LD->getMemoryVT();
	unsigned AS = LD->getAddressSpace();
	unsigned Align = LD->getAlignment();
	const DataLayout &DL = DAG.getDataLayout();
	// If this is an unaligned load and the target doesn't support it,
	// expand it.
	if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
	std::tie(RVal, RChain) = TLI.expandUnalignedLoad(LD, DAG);
	}
	break;
	}
	case TargetLowering::Custom:
	if (SDValue Res = TLI.LowerOperation(RVal, DAG)) {
	RVal = Res;
	RChain = Res.getValue(1);
	}
	break;

	case TargetLowering::Promote: {
	MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
	assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
	"Can only promote loads to same size type");

	SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getMemOperand());
	RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
	RChain = Res.getValue(1);
	break;
	}
	}
	if (RChain.getNode() != Node) {
	assert(RVal.getNode() != Node && "Load must be completely replaced");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain);
	if (UpdatedNodes) {
	UpdatedNodes->insert(RVal.getNode());
	UpdatedNodes->insert(RChain.getNode());
	}
	ReplacedNode(Node);
	}
	return;
	}

	DEBUG(dbgs() << "Legalizing extending load operation\n");
	EVT SrcVT = LD->getMemoryVT();
	unsigned SrcWidth = SrcVT.getSizeInBits();
	unsigned Alignment = LD->getAlignment();
	MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
	AAMDNodes AAInfo = LD->getAAInfo();

	if (SrcWidth != SrcVT.getStoreSizeInBits() &&
	// Some targets pretend to have an i1 loading operation, and actually
	// load an i8. This trick is correct for ZEXTLOAD because the top 7
	// bits are guaranteed to be zero; it helps the optimizers understand
	// that these bits are zero. It is also useful for EXTLOAD, since it
	// tells the optimizers that those bits are undefined. It would be
	// nice to have an effective generic way of getting these benefits...
	// Until such a way is found, don't insist on promoting i1 here.
	(SrcVT != MVT::i1 \|\|
	TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) ==
	TargetLowering::Promote)) {
	// Promote to a byte-sized load if not loading an integral number of
	// bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
	unsigned NewWidth = SrcVT.getStoreSizeInBits();
	EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth);
	SDValue Ch;

	// The extra bits are guaranteed to be zero, since we stored them that
	// way. A zext load from NVT thus automatically gives zext from SrcVT.

	ISD::LoadExtType NewExtType =
	ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;

	SDValue Result =
	DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo(), NVT, Alignment, MMOFlags, AAInfo);

	Ch = Result.getValue(1); // The chain.

	if (ExtType == ISD::SEXTLOAD)
	// Having the top bits zero doesn't help when sign extending.
	Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
	Result.getValueType(),
	Result, DAG.getValueType(SrcVT));
	else if (ExtType == ISD::ZEXTLOAD \|\| NVT == Result.getValueType())
	// All the top bits are guaranteed to be zero - inform the optimizers.
	Result = DAG.getNode(ISD::AssertZext, dl,
	Result.getValueType(), Result,
	DAG.getValueType(SrcVT));

	Value = Result;
	Chain = Ch;
	} else if (SrcWidth & (SrcWidth - 1)) {
	// If not loading a power-of-2 number of bits, expand as two loads.
	assert(!SrcVT.isVector() && "Unsupported extload!");
	unsigned RoundWidth = 1 << Log2_32(SrcWidth);
	assert(RoundWidth < SrcWidth);
	unsigned ExtraWidth = SrcWidth - RoundWidth;
	assert(ExtraWidth < RoundWidth);
	assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
	"Load size not an integral number of bytes!");
	EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
	EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
	SDValue Lo, Hi, Ch;
	unsigned IncrementSize;
	auto &DL = DAG.getDataLayout();

	if (DL.isLittleEndian()) {
	// EXTLOAD:i24 -> ZEXTLOAD:i16 \| (shl EXTLOAD@+2:i8, 16)
	// Load the bottom RoundWidth bits.
	Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
	AAInfo);

	// Load the remaining ExtraWidth bits.
	IncrementSize = RoundWidth / 8;
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	DAG.getConstant(IncrementSize, dl,
	Ptr.getValueType()));
	Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo().getWithOffset(IncrementSize),
	ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
	AAInfo);

	// Build a factor node to remember that this load is independent of
	// the other one.
	Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
	Hi.getValue(1));

	// Move the top bits to the right place.
	Hi = DAG.getNode(
	ISD::SHL, dl, Hi.getValueType(), Hi,
	DAG.getConstant(RoundWidth, dl,
	TLI.getShiftAmountTy(Hi.getValueType(), DL)));

	// Join the hi and lo parts.
	Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
	} else {
	// Big endian - avoid unaligned loads.
	// EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) \| ZEXTLOAD@+2:i8
	// Load the top RoundWidth bits.
	Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
	AAInfo);

	// Load the remaining ExtraWidth bits.
	IncrementSize = RoundWidth / 8;
	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
	DAG.getConstant(IncrementSize, dl,
	Ptr.getValueType()));
	Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
	LD->getPointerInfo().getWithOffset(IncrementSize),
	ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
	AAInfo);

	// Build a factor node to remember that this load is independent of
	// the other one.
	Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
	Hi.getValue(1));

	// Move the top bits to the right place.
	Hi = DAG.getNode(
	ISD::SHL, dl, Hi.getValueType(), Hi,
	DAG.getConstant(ExtraWidth, dl,
	TLI.getShiftAmountTy(Hi.getValueType(), DL)));

	// Join the hi and lo parts.
	Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
	}

	Chain = Ch;
	} else {
	bool isCustom = false;
	switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0),
	SrcVT.getSimpleVT())) {
	default: llvm_unreachable("This action is not supported yet!");
	case TargetLowering::Custom:
	isCustom = true;
	LLVM_FALLTHROUGH;
	case TargetLowering::Legal:
	Value = SDValue(Node, 0);
	Chain = SDValue(Node, 1);

	if (isCustom) {
	if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
	Value = Res;
	Chain = Res.getValue(1);
	}
	} else {
	// If this is an unaligned load and the target doesn't support it,
	// expand it.
	EVT MemVT = LD->getMemoryVT();
	unsigned AS = LD->getAddressSpace();
	unsigned Align = LD->getAlignment();
	const DataLayout &DL = DAG.getDataLayout();
	if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
	std::tie(Value, Chain) = TLI.expandUnalignedLoad(LD, DAG);
	}
	}
	break;

	case TargetLowering::Expand: {
	EVT DestVT = Node->getValueType(0);
	if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) {
	// If the source type is not legal, see if there is a legal extload to
	// an intermediate type that we can then extend further.
	EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT());
	if (TLI.isTypeLegal(SrcVT) \|\| // Same as SrcVT == LoadVT?
	TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT)) {
	// If we are loading a legal type, this is a non-extload followed by a
	// full extend.
	ISD::LoadExtType MidExtType =
	(LoadVT == SrcVT) ? ISD::NON_EXTLOAD : ExtType;

	SDValue Load = DAG.getExtLoad(MidExtType, dl, LoadVT, Chain, Ptr,
	SrcVT, LD->getMemOperand());
	unsigned ExtendOp =
	ISD::getExtForLoadExtType(SrcVT.isFloatingPoint(), ExtType);
	Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
	Chain = Load.getValue(1);
	break;
	}

	// Handle the special case of fp16 extloads. EXTLOAD doesn't have the
	// normal undefined upper bits behavior to allow using an in-reg extend
	// with the illegal FP type, so load as an integer and do the
	// from-integer conversion.
	if (SrcVT.getScalarType() == MVT::f16) {
	EVT ISrcVT = SrcVT.changeTypeToInteger();
	EVT IDestVT = DestVT.changeTypeToInteger();
	EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT());

	SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT,
	Chain, Ptr, ISrcVT,
	LD->getMemOperand());
	Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result);
	Chain = Result.getValue(1);
	break;
	}
	}

	assert(!SrcVT.isVector() &&
	"Vector Loads are handled in LegalizeVectorOps");

	// FIXME: This does not work for vectors on most targets. Sign-
	// and zero-extend operations are currently folded into extending
	// loads, whether they are legal or not, and then we end up here
	// without any support for legalizing them.
	assert(ExtType != ISD::EXTLOAD &&
	"EXTLOAD should always be supported!");
	// Turn the unsupported load into an EXTLOAD followed by an
	// explicit zero/sign extend inreg.
	SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl,
	Node->getValueType(0),
	Chain, Ptr, SrcVT,
	LD->getMemOperand());
	SDValue ValRes;
	if (ExtType == ISD::SEXTLOAD)
	ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
	Result.getValueType(),
	Result, DAG.getValueType(SrcVT));
	else
	ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
	Value = ValRes;
	Chain = Result.getValue(1);
	break;
	}
	}
	}

	// Since loads produce two values, make sure to remember that we legalized
	// both of them.
	if (Chain.getNode() != Node) {
	assert(Value.getNode() != Node && "Load must be completely replaced");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
	if (UpdatedNodes) {
	UpdatedNodes->insert(Value.getNode());
	UpdatedNodes->insert(Chain.getNode());
	}
	ReplacedNode(Node);
	}
	}

	/// Decide how a STRICT_* floating-point pseudo-op of type \p VT is legalized.
	///
	/// The pseudo-op is mapped onto its non-strict counterpart and the target is
	/// queried for that opcode. Only a Legal answer is passed through; every other
	/// answer (including Custom and Promote) is reported as Expand.
	static TargetLowering::LegalizeAction
	getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) {
	  // Translate the strict pseudo-opcode into the ordinary FP opcode.
	  unsigned NonStrictOpc;
	  switch (Opcode) {
	  case ISD::STRICT_FSQRT:
	    NonStrictOpc = ISD::FSQRT;
	    break;
	  case ISD::STRICT_FPOW:
	    NonStrictOpc = ISD::FPOW;
	    break;
	  case ISD::STRICT_FPOWI:
	    NonStrictOpc = ISD::FPOWI;
	    break;
	  case ISD::STRICT_FMA:
	    NonStrictOpc = ISD::FMA;
	    break;
	  case ISD::STRICT_FSIN:
	    NonStrictOpc = ISD::FSIN;
	    break;
	  case ISD::STRICT_FCOS:
	    NonStrictOpc = ISD::FCOS;
	    break;
	  case ISD::STRICT_FEXP:
	    NonStrictOpc = ISD::FEXP;
	    break;
	  case ISD::STRICT_FEXP2:
	    NonStrictOpc = ISD::FEXP2;
	    break;
	  case ISD::STRICT_FLOG:
	    NonStrictOpc = ISD::FLOG;
	    break;
	  case ISD::STRICT_FLOG10:
	    NonStrictOpc = ISD::FLOG10;
	    break;
	  case ISD::STRICT_FLOG2:
	    NonStrictOpc = ISD::FLOG2;
	    break;
	  case ISD::STRICT_FRINT:
	    NonStrictOpc = ISD::FRINT;
	    break;
	  case ISD::STRICT_FNEARBYINT:
	    NonStrictOpc = ISD::FNEARBYINT;
	    break;
	  default:
	    llvm_unreachable("Unexpected FP pseudo-opcode");
	  }

	  // Custom and Promote are not handled for strict FP pseudo-ops yet, so
	  // anything that is not plainly Legal is expanded.
	  if (TLI.getOperationAction(NonStrictOpc, VT) == TargetLowering::Legal)
	    return TargetLowering::Legal;
	  return TargetLowering::Expand;
	}

	/// Legalize a single node.
	///
	/// The function first works out which LegalizeAction the target requests for
	/// the node (the query differs per opcode), then either leaves the node alone
	/// (Legal), asks the target to custom-lower it, expands it, turns it into a
	/// libcall, or promotes it. LOAD and STORE are handed to LegalizeLoadOps /
	/// LegalizeStoreOps, and CALLSEQ_START / CALLSEQ_END are left untouched.
	/// Nothing is returned; replacements are made through ReplaceNode.
	void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
	DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));

	// Allow illegal target nodes and illegal registers.
	if (Node->getOpcode() == ISD::TargetConstant \|\|
	Node->getOpcode() == ISD::Register)
	return;

	// Debug-only sanity check: every result type and every operand type must
	// already be legal (TargetConstant and Register operands are exempt).
	#ifndef NDEBUG
	for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
	assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
	TargetLowering::TypeLegal \|\|
	TLI.isTypeLegal(Node->getValueType(i))) &&
	"Unexpected illegal type!");

	for (const SDValue &Op : Node->op_values())
	assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
	TargetLowering::TypeLegal \|\|
	TLI.isTypeLegal(Op.getValueType()) \|\|
	Op.getOpcode() == ISD::TargetConstant \|\|
	Op.getOpcode() == ISD::Register) &&
	"Unexpected illegal type!");
	#endif

	// Figure out the correct action; the way to query this varies by opcode
	TargetLowering::LegalizeAction Action = TargetLowering::Legal;
	// Stays true when the generic Legal/Custom/Expand/LibCall/Promote dispatch
	// further down is sufficient for this opcode.
	bool SimpleFinishLegalizing = true;
	switch (Node->getOpcode()) {
	case ISD::INTRINSIC_W_CHAIN:
	case ISD::INTRINSIC_WO_CHAIN:
	case ISD::INTRINSIC_VOID:
	case ISD::STACKSAVE:
	// These are queried with MVT::Other rather than with a result type.
	Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
	break;
	case ISD::GET_DYNAMIC_AREA_OFFSET:
	Action = TLI.getOperationAction(Node->getOpcode(),
	Node->getValueType(0));
	break;
	case ISD::VAARG:
	// Query by result type first; unless that yields Promote, the action
	// registered for MVT::Other is used instead.
	Action = TLI.getOperationAction(Node->getOpcode(),
	Node->getValueType(0));
	if (Action != TargetLowering::Promote)
	Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
	break;
	case ISD::FP_TO_FP16:
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	case ISD::EXTRACT_VECTOR_ELT:
	// The action is keyed on the type of operand 0, not on the result type.
	Action = TLI.getOperationAction(Node->getOpcode(),
	Node->getOperand(0).getValueType());
	break;
	case ISD::FP_ROUND_INREG:
	case ISD::SIGN_EXTEND_INREG: {
	// The action is keyed on the type carried by the VTSDNode in operand 1.
	EVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
	Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
	break;
	}
	case ISD::ATOMIC_STORE:
	// The action is keyed on the type of operand 2.
	Action = TLI.getOperationAction(Node->getOpcode(),
	Node->getOperand(2).getValueType());
	break;
	case ISD::SELECT_CC:
	case ISD::SETCC:
	case ISD::BR_CC: {
	// Locate the condition-code operand and the first compared operand; their
	// positions depend on the opcode.
	// NOTE(review): ISD::SETCCE is not among the case labels of this group, so
	// the SETCCE arm below looks unreachable from here - confirm.
	unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 :
	Node->getOpcode() == ISD::SETCC ? 2 :
	Node->getOpcode() == ISD::SETCCE ? 3 : 1;
	unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0;
	MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType();
	ISD::CondCode CCCode =
	cast<CondCodeSDNode>(Node->getOperand(CCOperand))->get();
	// The condition code is checked first; only when it is legal is the
	// operation itself queried.
	Action = TLI.getCondCodeAction(CCCode, OpVT);
	if (Action == TargetLowering::Legal) {
	if (Node->getOpcode() == ISD::SELECT_CC)
	Action = TLI.getOperationAction(Node->getOpcode(),
	Node->getValueType(0));
	else
	Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
	}
	break;
	}
	case ISD::LOAD:
	case ISD::STORE:
	// FIXME: Model these properly. LOAD and STORE are complicated, and
	// STORE expects the unlegalized operand in some cases.
	SimpleFinishLegalizing = false;
	break;
	case ISD::CALLSEQ_START:
	case ISD::CALLSEQ_END:
	// FIXME: This shouldn't be necessary. These nodes have special properties
	// dealing with the recursive nature of legalization. Removing this
	// special case should be done as part of making LegalizeDAG non-recursive.
	SimpleFinishLegalizing = false;
	break;
	case ISD::EXTRACT_ELEMENT:
	case ISD::FLT_ROUNDS_:
	case ISD::MERGE_VALUES:
	case ISD::EH_RETURN:
	case ISD::FRAME_TO_ARGS_OFFSET:
	case ISD::EH_DWARF_CFA:
	case ISD::EH_SJLJ_SETJMP:
	case ISD::EH_SJLJ_LONGJMP:
	case ISD::EH_SJLJ_SETUP_DISPATCH:
	// These operations lie about being legal: when they claim to be legal,
	// they should actually be expanded.
	Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
	if (Action == TargetLowering::Legal)
	Action = TargetLowering::Expand;
	break;
	case ISD::INIT_TRAMPOLINE:
	case ISD::ADJUST_TRAMPOLINE:
	case ISD::FRAMEADDR:
	case ISD::RETURNADDR:
	case ISD::ADDROFRETURNADDR:
	// These operations lie about being legal: when they claim to be legal,
	// they should actually be custom-lowered.
	Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
	if (Action == TargetLowering::Legal)
	Action = TargetLowering::Custom;
	break;
	case ISD::READCYCLECOUNTER:
	// READCYCLECOUNTER returns an i64, even if type legalization might have
	// expanded that to several smaller types.
	Action = TLI.getOperationAction(Node->getOpcode(), MVT::i64);
	break;
	case ISD::READ_REGISTER:
	case ISD::WRITE_REGISTER:
	// Named register is legal in the DAG, but blocked by register name
	// selection if not implemented by target (to choose the correct register)
	// They'll be converted to Copy(To/From)Reg.
	Action = TargetLowering::Legal;
	break;
	case ISD::DEBUGTRAP:
	Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
	if (Action == TargetLowering::Expand) {
	// replace ISD::DEBUGTRAP with ISD::TRAP
	SDValue NewVal;
	NewVal = DAG.getNode(ISD::TRAP, SDLoc(Node), Node->getVTList(),
	Node->getOperand(0));
	ReplaceNode(Node, NewVal.getNode());
	// The freshly created TRAP node is legalized right away.
	LegalizeOp(NewVal.getNode());
	return;
	}
	break;
	case ISD::STRICT_FSQRT:
	case ISD::STRICT_FMA:
	case ISD::STRICT_FPOW:
	case ISD::STRICT_FPOWI:
	case ISD::STRICT_FSIN:
	case ISD::STRICT_FCOS:
	case ISD::STRICT_FEXP:
	case ISD::STRICT_FEXP2:
	case ISD::STRICT_FLOG:
	case ISD::STRICT_FLOG10:
	case ISD::STRICT_FLOG2:
	case ISD::STRICT_FRINT:
	case ISD::STRICT_FNEARBYINT:
	// These pseudo-ops get legalized as if they were their non-strict
	// equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
	// is also legal, but if ISD::FSQRT requires expansion then so does
	// ISD::STRICT_FSQRT.
	Action = getStrictFPOpcodeAction(TLI, Node->getOpcode(),
	Node->getValueType(0));
	break;
	default:
	// Opcodes at or beyond BUILTIN_OP_END are treated as legal without asking
	// the target; everything else is queried by its first result type.
	if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
	Action = TargetLowering::Legal;
	} else {
	Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
	}
	break;
	}

	if (SimpleFinishLegalizing) {
	// Shift-like nodes may first need their shift-amount operand rewritten;
	// NewNode differs from Node only when that happened.
	SDNode *NewNode = Node;
	switch (Node->getOpcode()) {
	default: break;
	case ISD::SHL:
	case ISD::SRL:
	case ISD::SRA:
	case ISD::ROTL:
	case ISD::ROTR: {
	// Legalizing shifts/rotates requires adjusting the shift amount
	// to the appropriate width.
	SDValue Op0 = Node->getOperand(0);
	SDValue Op1 = Node->getOperand(1);
	if (!Op1.getValueType().isVector()) {
	SDValue SAO = DAG.getShiftAmountOperand(Op0.getValueType(), Op1);
	// The getShiftAmountOperand() may create a new operand node or
	// return the existing one. If new operand is created we need
	// to update the parent node.
	// Do not try to legalize SAO here! It will be automatically legalized
	// in the next round.
	if (SAO != Op1)
	NewNode = DAG.UpdateNodeOperands(Node, Op0, SAO);
	}
	}
	break;
	case ISD::SRL_PARTS:
	case ISD::SRA_PARTS:
	case ISD::SHL_PARTS: {
	// Legalizing shifts/rotates requires adjusting the shift amount
	// to the appropriate width. For the *_PARTS forms the amount is operand 2.
	SDValue Op0 = Node->getOperand(0);
	SDValue Op1 = Node->getOperand(1);
	SDValue Op2 = Node->getOperand(2);
	if (!Op2.getValueType().isVector()) {
	SDValue SAO = DAG.getShiftAmountOperand(Op0.getValueType(), Op2);
	// The getShiftAmountOperand() may create a new operand node or
	// return the existing one. If new operand is created we need
	// to update the parent node.
	if (SAO != Op2)
	NewNode = DAG.UpdateNodeOperands(Node, Op0, Op1, SAO);
	}
	break;
	}
	}

	if (NewNode != Node) {
	ReplaceNode(Node, NewNode);
	Node = NewNode;
	}
	// Generic dispatch. Note the deliberate fallthrough chain:
	// Custom (when the target declines) -> Expand (when ExpandNode fails)
	// -> LibCall.
	switch (Action) {
	case TargetLowering::Legal:
	DEBUG(dbgs() << "Legal node: nothing to do\n");
	return;
	case TargetLowering::Custom:
	DEBUG(dbgs() << "Trying custom legalization\n");
	// FIXME: The handling for custom lowering with multiple results is
	// a complete mess.
	if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
	// The target handed back result 0 of the very same node: nothing to
	// replace.
	if (!(Res.getNode() != Node \|\| Res.getResNo() != 0))
	return;

	if (Node->getNumValues() == 1) {
	DEBUG(dbgs() << "Successfully custom legalized node\n");
	// We can just directly replace this node with the lowered value.
	ReplaceNode(SDValue(Node, 0), Res);
	return;
	}

	// Multi-result node: result i of Node is replaced by result i of the
	// lowered node.
	SmallVector<SDValue, 8> ResultVals;
	for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
	ResultVals.push_back(Res.getValue(i));
	DEBUG(dbgs() << "Successfully custom legalized node\n");
	ReplaceNode(Node, ResultVals.data());
	return;
	}
	DEBUG(dbgs() << "Could not custom legalize node\n");
	LLVM_FALLTHROUGH;
	case TargetLowering::Expand:
	if (ExpandNode(Node))
	return;
	LLVM_FALLTHROUGH;
	case TargetLowering::LibCall:
	ConvertNodeToLibcall(Node);
	return;
	case TargetLowering::Promote:
	PromoteNode(Node);
	return;
	}
	}

	// Only the opcodes that cleared SimpleFinishLegalizing above are expected
	// here.
	switch (Node->getOpcode()) {
	default:
	#ifndef NDEBUG
	dbgs() << "NODE: ";
	Node->dump( &DAG);
	dbgs() << "\n";
	#endif
	llvm_unreachable("Do not know how to legalize this operator!");

	case ISD::CALLSEQ_START:
	case ISD::CALLSEQ_END:
	// Nothing is done for these nodes here.
	break;
	case ISD::LOAD:
	return LegalizeLoadOps(Node);
	case ISD::STORE:
	return LegalizeStoreOps(Node);
	}
	}

	/// Expand an extract-from-vector node by going through memory.
	///
	/// Operand 0 of \p Op is the source vector and operand 1 the index. The
	/// vector is stored to a stack slot (or an already existing suitable store of
	/// the same vector is reused) and the requested part is loaded back from the
	/// address computed by TLI.getVectorElementPointer. The result type of \p Op
	/// may be a vector (plain load) or a scalar (any-extending load of the vector
	/// element type).
	///
	/// \returns the value of the new load.
	SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
	  SDValue Vec = Op.getOperand(0);
	  SDValue Idx = Op.getOperand(1);
	  SDLoc dl(Op);

	  // Before we generate a new store to a temporary stack slot, see if there is
	  // already one that we can use. There often is because when we scalarize
	  // vector operations (using SelectionDAG::UnrollVectorOp for example) a whole
	  // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in
	  // the vector. If all are expanded here, we don't want one store per vector
	  // element.

	  // Caches for hasPredecessorHelper
	  SmallPtrSet<const SDNode *, 32> Visited;
	  SmallVector<const SDNode *, 16> Worklist;
	  Worklist.push_back(Idx.getNode());
	  SDValue StackPtr, Ch;
	  for (SDNode::use_iterator UI = Vec.getNode()->use_begin(),
	                            UE = Vec.getNode()->use_end();
	       UI != UE; ++UI) {
	    // Dereferencing a use_iterator yields the using node. It has to be held
	    // by pointer so that dyn_cast<> below can inspect it.
	    SDNode *User = *UI;
	    if (StoreSDNode *ST = dyn_cast<StoreSDNode>(User)) {
	      // Only a plain, full-width store of exactly this vector is reusable.
	      if (ST->isIndexed() || ST->isTruncatingStore() ||
	          ST->getValue() != Vec)
	        continue;

	      // Make sure that nothing else could have stored into the destination of
	      // this store.
	      if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode()))
	        continue;

	      // If the index is dependent on the store we will introduce a cycle when
	      // creating the load (the load uses the index, and by replacing the chain
	      // we will make the index dependent on the load). Also, the store might be
	      // dependent on the extractelement and introduce a cycle when creating
	      // the load.
	      if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) ||
	          ST->hasPredecessor(Op.getNode()))
	        continue;

	      StackPtr = ST->getBasePtr();
	      Ch = SDValue(ST, 0);
	      break;
	    }
	  }

	  EVT VecVT = Vec.getValueType();

	  if (!Ch.getNode()) {
	    // No reusable store was found: store the value to a temporary stack slot,
	    // then LOAD the returned part.
	    StackPtr = DAG.CreateStackTemporary(VecVT);
	    Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
	                      MachinePointerInfo());
	  }

	  // Advance the slot address to the requested element.
	  StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);

	  SDValue NewLoad;

	  if (Op.getValueType().isVector())
	    NewLoad =
	        DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo());
	  else
	    NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
	                             MachinePointerInfo(),
	                             VecVT.getVectorElementType());

	  // Replace the chain going out of the store, by the one out of the load.
	  DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));

	  // We introduced a cycle though, so update the loads operands, making sure
	  // to use the original store's chain as an incoming chain.
	  SmallVector<SDValue, 6> NewLoadOperands(NewLoad->op_begin(),
	                                          NewLoad->op_end());
	  NewLoadOperands[0] = Ch;
	  NewLoad =
	      SDValue(DAG.UpdateNodeOperands(NewLoad.getNode(), NewLoadOperands), 0);
	  return NewLoad;
	}

	/// Expand an insert-into-vector node by going through memory.
	///
	/// Operand 0 of \p Op is the vector being updated, operand 1 the part to
	/// insert and operand 2 the index. The whole vector is spilled to a fresh
	/// stack slot, the part is stored over it at the indexed position, and the
	/// updated vector is loaded back.
	SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
	  assert(Op.getValueType().isVector() && "Non-vector insert subvector!");

	  SDLoc dl(Op);
	  SDValue WholeVec = Op.getOperand(0);
	  SDValue Inserted = Op.getOperand(1);
	  SDValue Position = Op.getOperand(2);

	  // Allocate a stack slot big enough for the whole vector.
	  EVT VecVT = WholeVec.getValueType();
	  SDValue Slot = DAG.CreateStackTemporary(VecVT);
	  int FrameIdx = cast<FrameIndexSDNode>(Slot.getNode())->getIndex();
	  MachinePointerInfo SlotInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);

	  // Step 1: spill the original vector.
	  SDValue Chain =
	      DAG.getStore(DAG.getEntryNode(), dl, WholeVec, Slot, SlotInfo);

	  // Step 2: overwrite the selected position with the inserted part.
	  SDValue PartPtr = TLI.getVectorElementPointer(DAG, Slot, VecVT, Position);
	  Chain = DAG.getStore(Chain, dl, Inserted, PartPtr, MachinePointerInfo());

	  // Step 3: reload the slot as the result vector.
	  return DAG.getLoad(Op.getValueType(), dl, Chain, Slot, SlotInfo);
	}

	/// Build a vector by storing each operand of \p Node into a stack slot and
	/// loading the slot back as a value of the node's vector type.
	///
	/// Undef operands are not stored. An operand whose scalar type is wider than
	/// the vector element type is written with a truncating store.
	SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
	  // There is no efficient way to do this: allocate a suitably aligned stack
	  // object, fill it element by element and read it back as one vector.
	  EVT VT = Node->getValueType(0);
	  EVT EltVT = VT.getVectorElementType();
	  SDLoc dl(Node);
	  SDValue FIPtr = DAG.CreateStackTemporary(VT);
	  int FrameIdx = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
	  MachinePointerInfo SlotInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);

	  // One store per defined element, each hanging off the entry node.
	  SmallVector<SDValue, 8> Stores;
	  unsigned EltBytes = EltVT.getSizeInBits() / 8;
	  for (unsigned EltNo = 0, NumElts = Node->getNumOperands(); EltNo != NumElts;
	       ++EltNo) {
	    SDValue Elt = Node->getOperand(EltNo);

	    // Undef elements need no store.
	    if (Elt.isUndef())
	      continue;

	    // Address of this element inside the slot.
	    unsigned ByteOffset = EltBytes * EltNo;
	    SDValue EltPtr = DAG.getConstant(ByteOffset, dl, FIPtr.getValueType());
	    EltPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, EltPtr);

	    // When the operand is wider than the destination element, keep only the
	    // bits that fit.
	    if (EltVT.bitsLT(Elt.getValueType().getScalarType())) {
	      Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl, Elt, EltPtr,
	                                         SlotInfo.getWithOffset(ByteOffset),
	                                         EltVT));
	    } else {
	      Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Elt, EltPtr,
	                                    SlotInfo.getWithOffset(ByteOffset)));
	    }
	  }

	  // Join all stores; if every element was undef there is nothing to wait for.
	  SDValue StoreChain;
	  if (Stores.empty())
	    StoreChain = DAG.getEntryNode();
	  else
	    StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

	  // The result is the slot read back as a vector.
	  return DAG.getLoad(VT, dl, StoreChain, FIPtr, SlotInfo);
	}

	/// Make the sign bit of the floating-point \p Value available as an integer.
	///
	/// If an integer type of the same width is legal, the whole value is bitcast
	/// and \p State describes the top bit of that integer. Otherwise the value is
	/// stored to a stack temporary and only the byte holding the sign is loaded
	/// back (extended to the register type for i8); \p State then records the
	/// pointers, pointer infos and chain so that modifySignAsInt() can write the
	/// byte back, and the sign is bit 7 of the loaded value.
	void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
	                                             const SDLoc &DL,
	                                             SDValue Value) const {
	  EVT FloatVT = Value.getValueType();
	  unsigned FloatBits = FloatVT.getSizeInBits();
	  State.FloatVT = FloatVT;

	  // Easy case: a same-sized integer exists, so a bitcast is enough.
	  EVT SameSizeIntVT = EVT::getIntegerVT(*DAG.getContext(), FloatBits);
	  if (TLI.isTypeLegal(SameSizeIntVT)) {
	    State.IntValue = DAG.getNode(ISD::BITCAST, DL, SameSizeIntVT, Value);
	    State.SignMask = APInt::getSignMask(FloatBits);
	    State.SignBit = FloatBits - 1;
	    return;
	  }

	  auto &Layout = DAG.getDataLayout();
	  // Otherwise go through memory: spill the float and read back the sign byte.
	  MVT ByteRegVT = TLI.getRegisterType(*DAG.getContext(), MVT::i8);
	  // The temporary is created for both types so that the store and the load
	  // are each suitably aligned.
	  SDValue Slot = DAG.CreateStackTemporary(FloatVT, ByteRegVT);
	  int FrameIdx = cast<FrameIndexSDNode>(Slot.getNode())->getIndex();

	  // Spill the float.
	  State.FloatPtr = Slot;
	  MachineFunction &MF = DAG.getMachineFunction();
	  State.FloatPointerInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
	  State.Chain = DAG.getStore(DAG.getEntryNode(), DL, Value, State.FloatPtr,
	                             State.FloatPointerInfo);

	  // Work out where the byte with the sign bit lives.
	  SDValue SignBytePtr;
	  if (!Layout.isBigEndian()) {
	    // Little endian: the sign is in the last byte of the stored value.
	    unsigned SignByteOffset = (FloatVT.getSizeInBits() / 8) - 1;
	    SignBytePtr =
	        DAG.getNode(ISD::ADD, DL, Slot.getValueType(), Slot,
	                    DAG.getConstant(SignByteOffset, DL, Slot.getValueType()));
	    State.IntPointerInfo =
	        MachinePointerInfo::getFixedStack(MF, FrameIdx, SignByteOffset);
	  } else {
	    assert(FloatVT.isByteSized() && "Unsupported floating point type!");
	    // Big endian: the first byte already carries the sign bit.
	    SignBytePtr = Slot;
	    State.IntPointerInfo = State.FloatPointerInfo;
	  }

	  State.IntPtr = SignBytePtr;
	  State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, ByteRegVT, State.Chain,
	                                  SignBytePtr, State.IntPointerInfo, MVT::i8);
	  State.SignMask = APInt::getOneBitSet(ByteRegVT.getSizeInBits(), 7);
	  State.SignBit = 7;
	}

	/// Counterpart of getSignAsIntValue(): turn \p NewIntValue back into a value
	/// of type State.FloatVT.
	///
	/// When \p State carries no chain the integer is bitcast directly. Otherwise
	/// \p NewIntValue is written back as an i8 over the recorded integer pointer
	/// and the float is reloaded from the recorded float pointer.
	SDValue SelectionDAGLegalize::modifySignAsInt(const FloatSignAsInt &State,
	                                              const SDLoc &DL,
	                                              SDValue NewIntValue) const {
	  if (State.Chain) {
	    // Memory path: overwrite the sign-carrying byte on the stack, then reload
	    // the complete floating-point value.
	    SDValue StoreChain =
	        DAG.getTruncStore(State.Chain, DL, NewIntValue, State.IntPtr,
	                          State.IntPointerInfo, MVT::i8);
	    return DAG.getLoad(State.FloatVT, DL, StoreChain, State.FloatPtr,
	                       State.FloatPointerInfo);
	  }

	  // Register path: the integer covers the whole value, so just bitcast.
	  return DAG.getNode(ISD::BITCAST, DL, State.FloatVT, NewIntValue);
	}

	/// Expand FCOPYSIGN(Mag, Sign): produce a value with the magnitude of
	/// operand 0 and the sign of operand 1.
	///
	/// The sign bit of operand 1 is first isolated as an integer. If FABS and FNEG
	/// are legal or custom for the magnitude type the result is a select between
	/// FABS(Mag) and -FABS(Mag). Otherwise the magnitude is manipulated as an
	/// integer: its sign bit is cleared, the isolated sign bit is moved to the
	/// matching position and OR'ed in, and the result is converted back with
	/// modifySignAsInt().
	SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const {
	  SDLoc DL(Node);
	  SDValue Mag = Node->getOperand(0);
	  SDValue Sign = Node->getOperand(1);

	  // Get sign bit into an integer value.
	  FloatSignAsInt SignAsInt;
	  getSignAsIntValue(SignAsInt, DL, Sign);

	  EVT IntVT = SignAsInt.IntValue.getValueType();
	  SDValue SignMask = DAG.getConstant(SignAsInt.SignMask, DL, IntVT);
	  SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, SignAsInt.IntValue,
	                                SignMask);

	  // If FABS and FNEG are legal or custom, transform
	  // FCOPYSIGN(x, y) => sign(y) ? -FABS(x) : FABS(x)
	  EVT FloatVT = Mag.getValueType();
	  if (TLI.isOperationLegalOrCustom(ISD::FABS, FloatVT) &&
	      TLI.isOperationLegalOrCustom(ISD::FNEG, FloatVT)) {
	    SDValue AbsValue = DAG.getNode(ISD::FABS, DL, FloatVT, Mag);
	    SDValue NegValue = DAG.getNode(ISD::FNEG, DL, FloatVT, AbsValue);
	    SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(IntVT), SignBit,
	                                DAG.getConstant(0, DL, IntVT), ISD::SETNE);
	    return DAG.getSelect(DL, FloatVT, Cond, NegValue, AbsValue);
	  }

	  // Transform Mag value to integer, and clear the sign bit.
	  FloatSignAsInt MagAsInt;
	  getSignAsIntValue(MagAsInt, DL, Mag);
	  EVT MagVT = MagAsInt.IntValue.getValueType();
	  SDValue ClearSignMask = DAG.getConstant(~MagAsInt.SignMask, DL, MagVT);
	  SDValue ClearedSign = DAG.getNode(ISD::AND, DL, MagVT, MagAsInt.IntValue,
	                                    ClearSignMask);

	  // Get the signbit at the right position for MagAsInt.
	  //
	  // The shift must happen whenever the two sign-bit positions differ, even if
	  // both integers have the same width (one operand may have been bitcast
	  // while the other went through the stack and has its sign in bit 7).
	  // Shifting is done in the wider of the two types so the bit is never lost:
	  // widen first, then shift, then narrow.
	  int ShiftAmount = SignAsInt.SignBit - MagAsInt.SignBit;
	  EVT ShiftVT = IntVT;
	  if (SignBit.getValueSizeInBits() < ClearedSign.getValueSizeInBits()) {
	    SignBit = DAG.getNode(ISD::ZERO_EXTEND, DL, MagVT, SignBit);
	    ShiftVT = MagVT;
	  }
	  if (ShiftAmount > 0) {
	    SDValue ShiftCnst = DAG.getConstant(ShiftAmount, DL, ShiftVT);
	    SignBit = DAG.getNode(ISD::SRL, DL, ShiftVT, SignBit, ShiftCnst);
	  } else if (ShiftAmount < 0) {
	    SDValue ShiftCnst = DAG.getConstant(-ShiftAmount, DL, ShiftVT);
	    SignBit = DAG.getNode(ISD::SHL, DL, ShiftVT, SignBit, ShiftCnst);
	  }
	  if (SignBit.getValueSizeInBits() > ClearedSign.getValueSizeInBits())
	    SignBit = DAG.getNode(ISD::TRUNCATE, DL, MagVT, SignBit);

	  // Store the part with the modified sign and convert back to float.
	  SDValue CopiedSign = DAG.getNode(ISD::OR, DL, MagVT, ClearedSign, SignBit);
	  return modifySignAsInt(MagAsInt, DL, CopiedSign);
	}

	/// Expand FABS of operand 0 of \p Node.
	///
	/// Uses FCOPYSIGN(x, 0.0) when FCOPYSIGN is legal or custom for the type;
	/// otherwise clears the sign bit through the integer view provided by
	/// getSignAsIntValue() / modifySignAsInt().
	SDValue SelectionDAGLegalize::ExpandFABS(SDNode *Node) const {
	  SDLoc DL(Node);
	  SDValue Src = Node->getOperand(0);
	  EVT FloatVT = Src.getValueType();

	  // FABS(x) => FCOPYSIGN(x, 0.0) when the target can do FCOPYSIGN.
	  if (TLI.isOperationLegalOrCustom(ISD::FCOPYSIGN, FloatVT))
	    return DAG.getNode(ISD::FCOPYSIGN, DL, FloatVT, Src,
	                       DAG.getConstantFP(0.0, DL, FloatVT));

	  // Otherwise: view the sign-carrying part as an integer, mask the sign bit
	  // away and convert back.
	  FloatSignAsInt SrcAsInt;
	  getSignAsIntValue(SrcAsInt, DL, Src);
	  EVT IntVT = SrcAsInt.IntValue.getValueType();
	  SDValue KeepMask = DAG.getConstant(~SrcAsInt.SignMask, DL, IntVT);
	  SDValue Unsigned =
	      DAG.getNode(ISD::AND, DL, IntVT, SrcAsInt.IntValue, KeepMask);
	  return modifySignAsInt(SrcAsInt, DL, Unsigned);
	}

	/// Expand a DYNAMIC_STACKALLOC node into explicit stack-pointer arithmetic.
	///
	/// Operand 0 of \p Node is the incoming chain, operand 1 the allocation size
	/// and operand 2 a constant alignment. The stack pointer register is read,
	/// the size is subtracted, the result is masked down when the requested
	/// alignment exceeds the stack alignment, and the new value is written back
	/// to the stack pointer. The sequence is wrapped in CALLSEQ_START /
	/// CALLSEQ_END. \p Results receives the new address followed by the
	/// CALLSEQ_END value.
	void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
	                                                    SmallVectorImpl<SDValue> &Results) {
	  unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
	  assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
	                  " not tell us which reg is the stack pointer!");
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);
	  SDValue AllocSize = Node->getOperand(1);
	  SDValue AlignOp = Node->getOperand(2);

	  // Bracket the allocation with a call sequence so that the stack pointer is
	  // not modified while other instructions are using the stack.
	  SDValue Chain = DAG.getCALLSEQ_START(Node->getOperand(0), 0, 0, dl);

	  // Read the current stack pointer.
	  SDValue OldSP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
	  Chain = OldSP.getValue(1);

	  unsigned Align = cast<ConstantSDNode>(AlignOp)->getZExtValue();
	  unsigned StackAlign =
	      DAG.getSubtarget().getFrameLowering()->getStackAlignment();

	  // New stack pointer = old - size, rounded down if over-alignment was asked.
	  SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, OldSP, AllocSize);
	  if (Align > StackAlign)
	    NewSP = DAG.getNode(ISD::AND, dl, VT, NewSP,
	                        DAG.getConstant(-(uint64_t)Align, dl, VT));

	  // Commit the new stack pointer; this yields the output chain.
	  Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);

	  SDValue SeqEnd =
	      DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	                         DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	  Results.push_back(NewSP);
	  Results.push_back(SeqEnd);
	}

	/// Make the condition code of a SETCC-like comparison legal for the target.
	///
	/// \p LHS, \p RHS and \p CC are updated in place. Three outcomes are possible
	/// when the function returns true:
	///  - Operands swapped: LHS and RHS are exchanged and CC holds the
	///    swapped-operand condition; NeedInvert is false.
	///  - Split into two comparisons joined by AND / OR: the combined node is
	///    placed in LHS, RHS and CC are reset to SDValue(); NeedInvert is false.
	///  - Inverse condition (SETEQ <-> SETNE only): LHS and RHS are untouched, CC
	///    holds the inverse condition and NeedInvert is true. The caller must
	///    then invert the comparison result, e.g. with
	///    SelectionDAG::getLogicalNOT(), or otherwise swap the meaning of
	///    true/false.
	///
	/// \returns true if the condition was rewritten, false if it was already
	/// legal and nothing changed.
	bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
	                                                 SDValue &RHS, SDValue &CC,
	                                                 bool &NeedInvert,
	                                                 const SDLoc &dl) {
	  NeedInvert = false;
	  MVT OpVT = LHS.getSimpleValueType();
	  ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();

	  TargetLowering::LegalizeAction CondAction =
	      TLI.getCondCodeAction(CCCode, OpVT);
	  if (CondAction == TargetLowering::Legal)
	    return false; // Nothing to do.
	  if (CondAction != TargetLowering::Expand)
	    llvm_unreachable("Unknown condition code action!");

	  // First attempt: the same comparison with the operands exchanged.
	  ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(CCCode);
	  if (TLI.isCondCodeLegal(SwappedCC, OpVT)) {
	    std::swap(LHS, RHS);
	    CC = DAG.getCondCode(SwappedCC);
	    return true;
	  }

	  // Second attempt: pick two conditions and a logical operator that together
	  // are equivalent to the requested condition.
	  ISD::CondCode FirstCC = ISD::SETCC_INVALID;
	  ISD::CondCode SecondCC = ISD::SETCC_INVALID;
	  unsigned CombineOpc = 0;
	  switch (CCCode) {
	  default:
	    llvm_unreachable("Don't know how to expand this condition!");
	  case ISD::SETO:
	    assert(TLI.getCondCodeAction(ISD::SETOEQ, OpVT)
	               == TargetLowering::Legal
	           && "If SETO is expanded, SETOEQ must be legal!");
	    FirstCC = ISD::SETOEQ;
	    SecondCC = ISD::SETOEQ;
	    CombineOpc = ISD::AND;
	    break;
	  case ISD::SETUO:
	    assert(TLI.getCondCodeAction(ISD::SETUNE, OpVT)
	               == TargetLowering::Legal
	           && "If SETUO is expanded, SETUNE must be legal!");
	    FirstCC = ISD::SETUNE;
	    SecondCC = ISD::SETUNE;
	    CombineOpc = ISD::OR;
	    break;
	  case ISD::SETOEQ:
	  case ISD::SETOGT:
	  case ISD::SETOGE:
	  case ISD::SETOLT:
	  case ISD::SETOLE:
	  case ISD::SETONE:
	  case ISD::SETUEQ:
	  case ISD::SETUNE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	  case ISD::SETULT:
	  case ISD::SETULE:
	    // Floating-point operands are handled here; integer operands fall
	    // through to the unsupported cases below.
	    if (!OpVT.isInteger()) {
	      // Bit 0x8 of the condition code distinguishes the unordered variant
	      // from the ordered one.
	      bool IsUnorderedForm = ((unsigned)CCCode & 0x8U) != 0;
	      SecondCC = IsUnorderedForm ? ISD::SETUO : ISD::SETO;
	      CombineOpc = IsUnorderedForm ? ISD::OR : ISD::AND;
	      FirstCC = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10);
	      break;
	    }
	    // Fallthrough if we are unsigned integer.
	    LLVM_FALLTHROUGH;
	  case ISD::SETLE:
	  case ISD::SETGT:
	  case ISD::SETGE:
	  case ISD::SETLT:
	    // Only the swapped-operand form tried above is supported for these;
	    // there is no other expansion.
	    llvm_unreachable("Don't know how to expand this condition!");
	  case ISD::SETNE:
	  case ISD::SETEQ: {
	    // Last resort for equality tests: use the opposite test and ask the
	    // caller to invert the result.
	    ISD::CondCode OppositeCC =
	        CCCode == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
	    if (TLI.isCondCodeLegal(OppositeCC, OpVT)) {
	      CC = DAG.getCondCode(OppositeCC);
	      NeedInvert = true;
	      return true;
	    }
	    // Without the opposite test there is no way to expand the condition.
	    llvm_unreachable("Don't know how to expand this condition!");
	  }
	  }

	  SDValue SetCC1, SetCC2;
	  if (CCCode == ISD::SETO || CCCode == ISD::SETUO) {
	    // Pure ordered / unordered test: each operand is compared with itself,
	    // i.e. (LHS CC1 LHS) Opc (RHS CC2 RHS).
	    SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, FirstCC);
	    SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, SecondCC);
	  } else {
	    // Everything else: (LHS CC1 RHS) Opc (LHS CC2 RHS).
	    SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, FirstCC);
	    SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, SecondCC);
	  }

	  // Hand the combined node back through LHS and clear RHS / CC.
	  LHS = DAG.getNode(CombineOpc, dl, VT, SetCC1, SetCC2);
	  RHS = SDValue();
	  CC = SDValue();
	  return true;
	}

	/// Emit a store/load combination to the stack. This stores
	/// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does
	/// a load from the stack slot to DestVT, extending it if needed.
	/// The resultant code need not be legal.
	///
	/// SrcOp must be at least as wide as SlotVT, and SlotVT must be no wider than
	/// DestVT; both conditions are asserted.
	SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
	                                               EVT DestVT, const SDLoc &dl) {
	  // Create the stack frame object, aligned for the source type.
	  unsigned SrcAlign = DAG.getDataLayout().getPrefTypeAlignment(
	      SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
	  SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);

	  FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
	  int SPFI = StackPtrFI->getIndex();
	  MachinePointerInfo PtrInfo =
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);

	  unsigned SrcSize = SrcOp.getValueSizeInBits();
	  unsigned SlotSize = SlotVT.getSizeInBits();
	  unsigned DestSize = DestVT.getSizeInBits();
	  Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
	  unsigned DestAlign = DAG.getDataLayout().getPrefTypeAlignment(DestType);

	  // Emit a store to the stack slot. Use a truncstore if the input value is
	  // larger than SlotVT.
	  SDValue Store;

	  if (SrcSize > SlotSize)
	    Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo,
	                              SlotVT, SrcAlign);
	  else {
	    assert(SrcSize == SlotSize && "Invalid store");
	    Store =
	        DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
	  }

	  // Result is a load from the stack slot.
	  if (SlotSize == DestSize)
	    return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign);

	  // The slot is narrower than the destination, so use an extending load.
	  assert(SlotSize < DestSize && "Unknown extension!");
	  return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT,
	                        DestAlign);
	}

	/// Expand a SCALAR_TO_VECTOR node through memory: the scalar operand is
	/// stored at the start of a stack temporary of the result vector type, and
	/// the whole vector is then loaded back from that temporary.
	SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
	  SDLoc Loc(Node);
	  EVT VecVT = Node->getValueType(0);

	  // The temporary is sized and aligned for the full result vector.
	  SDValue Slot = DAG.CreateStackTemporary(VecVT);
	  int FrameIdx = cast<FrameIndexSDNode>(Slot)->getIndex();

	  // Store the scalar into element #0, truncating it to the element type.
	  SDValue StoreChain = DAG.getTruncStore(
	      DAG.getEntryNode(), Loc, Node->getOperand(0), Slot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
	      VecVT.getVectorElementType());

	  // Read the entire vector back, chained after the store.
	  return DAG.getLoad(
	      VecVT, Loc, StoreChain, Slot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
	}

	/// Try to build the BUILD_VECTOR \p Node out of SCALAR_TO_VECTOR nodes that
	/// are merged pairwise with vector shuffles.
	///
	/// \param Res receives the final shuffle when the expansion succeeds.
	/// \returns false, leaving \p Res untouched, if any of the required shuffle
	/// masks is rejected by TLI.isShuffleMaskLegal; true otherwise.
	static bool
	ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
	                     const TargetLowering &TLI, SDValue &Res) {
	  unsigned NumElems = Node->getNumOperands();
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);

	  // Try to group the scalars into pairs, shuffle the pairs together, then
	  // shuffle the pairs of pairs together, etc. until the vector has
	  // been built. This will work only if all of the necessary shuffle masks
	  // are legal.

	  // We do this in two phases; first to check the legality of the shuffles,
	  // and next, assuming that all shuffles are legal, to create the new nodes.
	  for (int Phase = 0; Phase < 2; ++Phase) {
	    // Each entry pairs an intermediate vector with the list of final result
	    // lanes its leading elements correspond to. In phase 0 the vector itself
	    // is left empty; only the index lists are tracked.
	    SmallVector<std::pair<SDValue, SmallVector<int, 16>>, 16> IntermedVals,
	                                                               NewIntermedVals;
	    for (unsigned i = 0; i < NumElems; ++i) {
	      SDValue V = Node->getOperand(i);
	      if (V.isUndef())
	        continue;

	      SDValue Vec;
	      if (Phase)
	        Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, V);
	      IntermedVals.push_back(std::make_pair(Vec, SmallVector<int, 16>(1, i)));
	    }

	    while (IntermedVals.size() > 2) {
	      NewIntermedVals.clear();
	      for (unsigned i = 0, e = (IntermedVals.size() & ~1u); i < e; i += 2) {
	        // This vector and the next vector are shuffled together (simply to
	        // append the one to the other).
	        SmallVector<int, 16> ShuffleVec(NumElems, -1);

	        SmallVector<int, 16> FinalIndices;
	        FinalIndices.reserve(IntermedVals[i].second.size() +
	                             IntermedVals[i+1].second.size());

	        int k = 0;
	        for (unsigned j = 0, f = IntermedVals[i].second.size(); j != f;
	             ++j, ++k) {
	          ShuffleVec[k] = j;
	          FinalIndices.push_back(IntermedVals[i].second[j]);
	        }
	        for (unsigned j = 0, f = IntermedVals[i+1].second.size(); j != f;
	             ++j, ++k) {
	          ShuffleVec[k] = NumElems + j;
	          FinalIndices.push_back(IntermedVals[i+1].second[j]);
	        }

	        SDValue Shuffle;
	        if (Phase)
	          Shuffle = DAG.getVectorShuffle(VT, dl, IntermedVals[i].first,
	                                         IntermedVals[i+1].first,
	                                         ShuffleVec);
	        else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
	          return false;
	        NewIntermedVals.push_back(
	            std::make_pair(Shuffle, std::move(FinalIndices)));
	      }

	      // If we had an odd number of defined values, then append the last
	      // element to the array of new vectors.
	      if ((IntermedVals.size() & 1) != 0)
	        NewIntermedVals.push_back(IntermedVals.back());

	      IntermedVals.swap(NewIntermedVals);
	    }

	    assert(IntermedVals.size() <= 2 && IntermedVals.size() > 0 &&
	           "Invalid number of intermediate vectors");
	    SDValue Vec1 = IntermedVals[0].first;
	    SDValue Vec2;
	    if (IntermedVals.size() > 1)
	      Vec2 = IntermedVals[1].first;
	    else if (Phase)
	      Vec2 = DAG.getUNDEF(VT);

	    // Final mask: scatter the gathered elements to their result lanes.
	    SmallVector<int, 16> ShuffleVec(NumElems, -1);
	    for (unsigned i = 0, e = IntermedVals[0].second.size(); i != e; ++i)
	      ShuffleVec[IntermedVals[0].second[i]] = i;
	    // Only read the second entry when it exists; with a single intermediate
	    // vector the second shuffle input is undef and contributes no lanes.
	    if (IntermedVals.size() > 1)
	      for (unsigned i = 0, e = IntermedVals[1].second.size(); i != e; ++i)
	        ShuffleVec[IntermedVals[1].second[i]] = NumElems + i;

	    if (Phase)
	      Res = DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
	    else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
	      return false;
	  }

	  return true;
	}

	/// Expand a BUILD_VECTOR node on targets that don't
	/// support the operation, but do support the resultant vector type.
	///
	/// Strategies are tried in this order: UNDEF when every operand is undef,
	/// SCALAR_TO_VECTOR when only element 0 is defined, a constant-pool load when
	/// every defined operand is a constant, vector shuffles when the target asks
	/// for them and the masks are legal, and finally a build through the stack.
	SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
	  unsigned NumElems = Node->getNumOperands();
	  SDValue Value1, Value2;
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);
	  EVT OpVT = Node->getOperand(0).getValueType();
	  EVT EltVT = VT.getVectorElementType();

	  // If the only non-undef value is the low element, turn this into a
	  // SCALAR_TO_VECTOR node. If this is { X, X, X, X }, determine X.
	  bool isOnlyLowElement = true;
	  bool MoreThanTwoValues = false;
	  bool isConstant = true;
	  for (unsigned i = 0; i < NumElems; ++i) {
	    SDValue V = Node->getOperand(i);
	    if (V.isUndef())
	      continue;
	    if (i > 0)
	      isOnlyLowElement = false;
	    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
	      isConstant = false;

	    // Track the first two distinct defined values and note when a third
	    // distinct value shows up.
	    if (!Value1.getNode()) {
	      Value1 = V;
	    } else if (!Value2.getNode()) {
	      if (V != Value1)
	        Value2 = V;
	    } else if (V != Value1 && V != Value2) {
	      MoreThanTwoValues = true;
	    }
	  }

	  // No defined operand at all: the whole vector is undef.
	  if (!Value1.getNode())
	    return DAG.getUNDEF(VT);

	  if (isOnlyLowElement)
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));

	  // If all elements are constants, create a load from the constant pool.
	  if (isConstant) {
	    SmallVector<Constant*, 16> CV;
	    for (unsigned i = 0, e = NumElems; i != e; ++i) {
	      if (ConstantFPSDNode *V =
	          dyn_cast<ConstantFPSDNode>(Node->getOperand(i))) {
	        CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
	      } else if (ConstantSDNode *V =
	                 dyn_cast<ConstantSDNode>(Node->getOperand(i))) {
	        if (OpVT==EltVT)
	          CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
	        else {
	          // If OpVT and EltVT don't match, EltVT is not legal and the
	          // element values have been promoted/truncated earlier. Undo this;
	          // we don't want a v16i8 to become a v16i32 for example.
	          const ConstantInt *CI = V->getConstantIntValue();
	          CV.push_back(ConstantInt::get(EltVT.getTypeForEVT(*DAG.getContext()),
	                                        CI->getZExtValue()));
	        }
	      } else {
	        assert(Node->getOperand(i).isUndef());
	        Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
	        CV.push_back(UndefValue::get(OpNTy));
	      }
	    }
	    Constant *CP = ConstantVector::get(CV);
	    SDValue CPIdx =
	        DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
	    unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
	    return DAG.getLoad(
	        VT, dl, DAG.getEntryNode(), CPIdx,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	        Alignment);
	  }

	  // Count the distinct defined operands for the target's shuffle heuristic.
	  SmallSet<SDValue, 16> DefinedValues;
	  for (unsigned i = 0; i < NumElems; ++i) {
	    if (Node->getOperand(i).isUndef())
	      continue;
	    DefinedValues.insert(Node->getOperand(i));
	  }

	  if (TLI.shouldExpandBuildVectorWithShuffles(VT, DefinedValues.size())) {
	    if (!MoreThanTwoValues) {
	      // At most two distinct values: one shuffle of two SCALAR_TO_VECTOR
	      // nodes suffices. Lane 0 of the first input holds Value1 and lane 0 of
	      // the second input (mask index NumElems) holds Value2.
	      SmallVector<int, 8> ShuffleVec(NumElems, -1);
	      for (unsigned i = 0; i < NumElems; ++i) {
	        SDValue V = Node->getOperand(i);
	        if (V.isUndef())
	          continue;
	        ShuffleVec[i] = V == Value1 ? 0 : NumElems;
	      }
	      if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
	        // Get the splatted value into the low element of a vector register.
	        SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
	        SDValue Vec2;
	        if (Value2.getNode())
	          Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
	        else
	          Vec2 = DAG.getUNDEF(VT);

	        // Return shuffle(LowValVec, undef, <0,0,0,0>)
	        return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
	      }
	    } else {
	      SDValue Res;
	      if (ExpandBVWithShuffles(Node, DAG, TLI, Res))
	        return Res;
	    }
	  }

	  // Otherwise, we can't handle this case efficiently.
	  return ExpandVectorBuildThroughStack(Node);
	}

	// Expand a node into a call to a libcall. Every operand of Node is passed as
	// an argument and the call returns a value of Node's first result type.
	// Returns the call's result value, or the DAG root when the call was
	// emitted as a tail call.
	SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
	                                            bool isSigned) {
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (const SDValue &Op : Node->op_values()) {
	    EVT ArgVT = Op.getValueType();
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Node = Op;
	    Entry.Ty = ArgTy;
	-   Entry.IsSExt = isSigned;
	-   Entry.IsZExt = !isSigned;
	+   Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned);
	+   Entry.IsZExt = !TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned);
	    Args.push_back(Entry);
	  }
	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	- Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
	+ EVT RetVT = Node->getValueType(0);
	+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());

	  // By default, the input chain to this libcall is the entry node of the
	  // function. If the libcall is going to be emitted as a tail call then
	  // TLI.isUsedByReturnOnly will change it to the right chain if the return
	  // node which is being folded has a non-entry input chain.
	  SDValue InChain = DAG.getEntryNode();

	  // isTailCall may be true since the callee does not reference caller stack
	  // frame. Check if it's in the right position and that the return types match.
	  SDValue TCChain = InChain;
	  const Function &F = DAG.getMachineFunction().getFunction();
	  bool isTailCall =
	      TLI.isInTailCallPosition(DAG, Node, TCChain) &&
	      (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
	  if (isTailCall)
	    InChain = TCChain;

	  TargetLowering::CallLoweringInfo CLI(DAG);
	+ bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, isSigned);
	  CLI.setDebugLoc(SDLoc(Node))
	      .setChain(InChain)
	      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
	                    std::move(Args))
	      .setTailCall(isTailCall)
	-     .setSExtResult(isSigned)
	-     .setZExtResult(!isSigned)
	+     .setSExtResult(signExtend)
	+     .setZExtResult(!signExtend)
	      .setIsPostTypeLegalization(true);

	  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  // An empty second element means the call was lowered as a tail call.
	  if (!CallInfo.second.getNode()) {
	    DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump());
	    // It's a tailcall, return the chain (which is the DAG root).
	    return DAG.getRoot();
	  }

	  DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump());
	  return CallInfo.first;
	}

	/// Generate a libcall taking the given operands as arguments
	/// and returning a result of type RetVT.
	///
	/// \p Ops points at \p NumOps argument values. \p isSigned selects sign
	/// extension (true) or zero extension (false) for every argument and for the
	/// result. The call is chained to the function's entry node.
	/// \returns the first element of the pair produced by TLI.LowerCallTo.
	SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
	                                            const SDValue *Ops, unsigned NumOps,
	                                            bool isSigned, const SDLoc &dl) {
	  TargetLowering::ArgListTy Args;
	  Args.reserve(NumOps);

	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0; i != NumOps; ++i) {
	    Entry.Node = Ops[i];
	    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
	    Entry.IsSExt = isSigned;
	    Entry.IsZExt = !isSigned;
	    Args.push_back(Entry);
	  }
	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
	                    std::move(Args))
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned)
	      .setIsPostTypeLegalization(true);

	  std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  return CallInfo.first;
	}

	// Expand a node into a call to a libcall. Similar to
	// ExpandLibCall except that the first operand is the in-chain.
	// Operands 1..N-1 of Node become the call arguments. The pair produced by
	// TLI.LowerCallTo is returned unchanged.
	std::pair<SDValue, SDValue>
	SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
	                                         SDNode *Node,
	                                         bool isSigned) {
	  SDValue InChain = Node->getOperand(0);

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  // Operand 0 is the chain, so the arguments start at operand 1.
	  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = Node->getOperand(i).getValueType();
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Node = Node->getOperand(i);
	    Entry.Ty = ArgTy;
	    Entry.IsSExt = isSigned;
	    Entry.IsZExt = !isSigned;
	    Args.push_back(Entry);
	  }
	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(SDLoc(Node))
	      .setChain(InChain)
	      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
	                    std::move(Args))
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  return CallInfo;
	}

	/// Expand \p Node into the floating-point libcall that matches the type of
	/// its first result. One candidate libcall is supplied per supported type
	/// (f32, f64, f80, f128, ppcf128); any other type is unreachable.
	SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
	                                              RTLIB::Libcall Call_F32,
	                                              RTLIB::Libcall Call_F64,
	                                              RTLIB::Libcall Call_F80,
	                                              RTLIB::Libcall Call_F128,
	                                              RTLIB::Libcall Call_PPCF128) {
	  // Strict FP nodes are first mutated via mutateStrictFPToFP.
	  if (Node->isStrictFPOpcode())
	    Node = DAG.mutateStrictFPToFP(Node);

	  // Dispatch on the result type; the call is always made with
	  // isSigned == false.
	  switch (Node->getSimpleValueType(0).SimpleTy) {
	  case MVT::f32:
	    return ExpandLibCall(Call_F32, Node, false);
	  case MVT::f64:
	    return ExpandLibCall(Call_F64, Node, false);
	  case MVT::f80:
	    return ExpandLibCall(Call_F80, Node, false);
	  case MVT::f128:
	    return ExpandLibCall(Call_F128, Node, false);
	  case MVT::ppcf128:
	    return ExpandLibCall(Call_PPCF128, Node, false);
	  default:
	    break;
	  }
	  llvm_unreachable("Unexpected request for libcall!");
	}

	/// Expand \p Node into the integer libcall that matches the type of its
	/// first result. One candidate libcall is supplied per supported type
	/// (i8, i16, i32, i64, i128); any other type is unreachable. \p isSigned is
	/// forwarded to ExpandLibCall.
	SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
	                                               RTLIB::Libcall Call_I8,
	                                               RTLIB::Libcall Call_I16,
	                                               RTLIB::Libcall Call_I32,
	                                               RTLIB::Libcall Call_I64,
	                                               RTLIB::Libcall Call_I128) {
	  switch (Node->getSimpleValueType(0).SimpleTy) {
	  case MVT::i8:
	    return ExpandLibCall(Call_I8, Node, isSigned);
	  case MVT::i16:
	    return ExpandLibCall(Call_I16, Node, isSigned);
	  case MVT::i32:
	    return ExpandLibCall(Call_I32, Node, isSigned);
	  case MVT::i64:
	    return ExpandLibCall(Call_I64, Node, isSigned);
	  case MVT::i128:
	    return ExpandLibCall(Call_I128, Node, isSigned);
	  default:
	    break;
	  }
	  llvm_unreachable("Unexpected request for libcall!");
	}

	/// Issue libcalls to __{u}divmod to compute div / rem pairs.
	///
	/// The call's direct result is pushed to \p Results first. The remainder is
	/// written by the callee through a pointer to a stack temporary, loaded back
	/// after the call, and pushed second.
	void
	SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
	                                          SmallVectorImpl<SDValue> &Results) {
	  unsigned Opcode = Node->getOpcode();
	  bool isSigned = Opcode == ISD::SDIVREM;

	  RTLIB::Libcall LC;
	  switch (Node->getSimpleValueType(0).SimpleTy) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
	  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
	  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
	  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
	  case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
	  }

	  // The input chain to this libcall is the entry node of the function.
	  // Legalizing the call will automatically add the previous call to the
	  // dependence.
	  SDValue InChain = DAG.getEntryNode();

	  EVT RetVT = Node->getValueType(0);
	  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (const SDValue &Op : Node->op_values()) {
	    EVT ArgVT = Op.getValueType();
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Node = Op;
	    Entry.Ty = ArgTy;
	    Entry.IsSExt = isSigned;
	    Entry.IsZExt = !isSigned;
	    Args.push_back(Entry);
	  }

	  // Also pass the return address of the remainder.
	  SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
	  Entry.Node = FIPtr;
	  Entry.Ty = RetTy->getPointerTo();
	  Entry.IsSExt = isSigned;
	  Entry.IsZExt = !isSigned;
	  Args.push_back(Entry);

	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  SDLoc dl(Node);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
	                    std::move(Args))
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  // Remainder is loaded back from the stack frame.
	  SDValue Rem =
	      DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());
	  Results.push_back(CallInfo.first);
	  Results.push_back(Rem);
	}

	/// Report whether the target provides a name for the sincos libcall that
	/// matches the type of \p Node's first result. Types other than f32, f64,
	/// f80, f128 and ppcf128 are unreachable.
	static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
	  switch (Node->getSimpleValueType(0).SimpleTy) {
	  case MVT::f32:
	    return TLI.getLibcallName(RTLIB::SINCOS_F32) != nullptr;
	  case MVT::f64:
	    return TLI.getLibcallName(RTLIB::SINCOS_F64) != nullptr;
	  case MVT::f80:
	    return TLI.getLibcallName(RTLIB::SINCOS_F80) != nullptr;
	  case MVT::f128:
	    return TLI.getLibcallName(RTLIB::SINCOS_F128) != nullptr;
	  case MVT::ppcf128:
	    return TLI.getLibcallName(RTLIB::SINCOS_PPCF128) != nullptr;
	  default:
	    break;
	  }
	  llvm_unreachable("Unexpected request for libcall!");
	}

	/// Only issue sincos libcall if both sin and cos are needed.
	///
	/// \p Node is an FSIN or FCOS node. \returns true if its operand has another
	/// user that is the complementary operation or an FSINCOS node.
	static bool useSinCos(SDNode *Node) {
	  unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
	    ? ISD::FCOS : ISD::FSIN;

	  SDValue Op0 = Node->getOperand(0);
	  for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
	       UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
	    SDNode *User = *UI;
	    if (User == Node)
	      continue;
	    // The other user might have been turned into sincos already.
	    if (User->getOpcode() == OtherOpcode || User->getOpcode() == ISD::FSINCOS)
	      return true;
	  }
	  return false;
	}

	/// Issue libcalls to sincos to compute sin / cos pairs.
	///
	/// The libcall is declared as returning void; it receives the operand plus
	/// two pointers to stack temporaries. After the call the sine is loaded from
	/// the first temporary and pushed to \p Results, followed by the cosine
	/// loaded from the second.
	void
	SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
	                                          SmallVectorImpl<SDValue> &Results) {
	  RTLIB::Libcall LC;
	  switch (Node->getSimpleValueType(0).SimpleTy) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
	  case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
	  case MVT::f80:     LC = RTLIB::SINCOS_F80; break;
	  case MVT::f128:    LC = RTLIB::SINCOS_F128; break;
	  case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
	  }

	  // The input chain to this libcall is the entry node of the function.
	  // Legalizing the call will automatically add the previous call to the
	  // dependence.
	  SDValue InChain = DAG.getEntryNode();

	  EVT RetVT = Node->getValueType(0);
	  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  // Pass the argument.
	  Entry.Node = Node->getOperand(0);
	  Entry.Ty = RetTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  // Pass the return address of sin.
	  SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
	  Entry.Node = SinPtr;
	  Entry.Ty = RetTy->getPointerTo();
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  // Also pass the return address of the cos.
	  SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
	  Entry.Node = CosPtr;
	  Entry.Ty = RetTy->getPointerTo();
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  SDLoc dl(Node);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
	      TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
	      std::move(Args));

	  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);

	  // Both loads are chained after the call (CallInfo.second).
	  Results.push_back(
	      DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
	  Results.push_back(
	      DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
	}

	/// This function is responsible for legalizing a
	/// INT_TO_FP operation of the specified operand when the target requests that
	/// we expand it. At this point, we know that the result and operand types are
	/// legal for the target.
	///
	/// Three strategies are used, in this order:
	///  - i32 source with f64 legal: build a double in a stack slot whose high
	///    word is 0x43300000 and whose low word is the (sign-bit-flipped, when
	///    signed) integer, subtract the matching bias, then round or extend to
	///    DestVT.
	///  - unsigned i64 source to f64 or f32: dedicated sequences.
	///  - any other unsigned source: SINT_TO_FP followed by adding a fudge factor
	///    loaded from the constant pool when the sign bit of the input was set.
	/// Signed sources other than i32 are rejected by an assertion.
	SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
	                                                   EVT DestVT,
	                                                   const SDLoc &dl) {
	  // TODO: Should any fast-math-flags be set for the created nodes?
	  DEBUG(dbgs() << "Legalizing INT_TO_FP\n");
	  if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
	    DEBUG(dbgs() << "32-bit [signed|unsigned] integer to float/double "
	                    "expansion\n");

	    // Get the stack frame index of a 8 byte buffer.
	    SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);

	    // word offset constant for Hi/Lo address computation
	    // NOTE(review): sizeof(int) is evaluated on the host; this presumably
	    // relies on it being 4 bytes to address the second i32 word -- confirm.
	    SDValue WordOff = DAG.getConstant(sizeof(int), dl,
	                                      StackSlot.getValueType());
	    // set up Hi and Lo (into buffer) address based on endian
	    SDValue Hi = StackSlot;
	    SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(),
	                             StackSlot, WordOff);
	    if (DAG.getDataLayout().isLittleEndian())
	      std::swap(Hi, Lo);

	    // if signed map to unsigned space
	    SDValue Op0Mapped;
	    if (isSigned) {
	      // constant used to invert sign bit (signed to unsigned mapping)
	      SDValue SignBit = DAG.getConstant(0x80000000u, dl, MVT::i32);
	      Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit);
	    } else {
	      Op0Mapped = Op0;
	    }
	    // store the lo of the constructed double - based on integer input
	    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op0Mapped, Lo,
	                                  MachinePointerInfo());
	    // initial hi portion of constructed double
	    SDValue InitialHi = DAG.getConstant(0x43300000u, dl, MVT::i32);
	    // store the hi of the constructed double - biased exponent
	    SDValue Store2 =
	        DAG.getStore(Store1, dl, InitialHi, Hi, MachinePointerInfo());
	    // load the constructed double
	    SDValue Load =
	        DAG.getLoad(MVT::f64, dl, Store2, StackSlot, MachinePointerInfo());
	    // FP constant to bias correct the final result
	    SDValue Bias = DAG.getConstantFP(isSigned ?
	                                     BitsToDouble(0x4330000080000000ULL) :
	                                     BitsToDouble(0x4330000000000000ULL),
	                                     dl, MVT::f64);
	    // subtract the bias
	    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias);
	    // final result
	    SDValue Result;
	    // handle final rounding
	    if (DestVT == MVT::f64) {
	      // do nothing
	      Result = Sub;
	    } else if (DestVT.bitsLT(MVT::f64)) {
	      Result = DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
	                           DAG.getIntPtrConstant(0, dl));
	    } else if (DestVT.bitsGT(MVT::f64)) {
	      Result = DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
	    }
	    return Result;
	  }
	  assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
	  // Code below here assumes !isSigned without checking again.

	  // Implementation of unsigned i64 to f64 following the algorithm in
	  // __floatundidf in compiler_rt. This implementation has the advantage
	  // of performing rounding correctly, both in the default rounding mode
	  // and in all alternate rounding modes.
	  // TODO: Generalize this for use with other types.
	  if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f64) {
	    DEBUG(dbgs() << "Converting unsigned i64 to f64\n");
	    SDValue TwoP52 =
	        DAG.getConstant(UINT64_C(0x4330000000000000), dl, MVT::i64);
	    SDValue TwoP84PlusTwoP52 =
	        DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl,
	                          MVT::f64);
	    SDValue TwoP84 =
	        DAG.getConstant(UINT64_C(0x4530000000000000), dl, MVT::i64);

	    // OR each 32-bit half of the input into the corresponding bit pattern,
	    // then reinterpret both as doubles.
	    SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32);
	    SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0,
	                             DAG.getConstant(32, dl, MVT::i64));
	    SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52);
	    SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84);
	    SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, LoOr);
	    SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, HiOr);
	    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt,
	                                TwoP84PlusTwoP52);
	    return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub);
	  }

	  // TODO: Generalize this for use with other types.
	  if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) {
	    DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
	    // For unsigned conversions, convert them to signed conversions using the
	    // algorithm from the x86_64 __floatundidf in compiler_rt.
	    if (!isSigned) {
	      // Fast path: the value read as signed is non-negative.
	      SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0);

	      // Slow path: halve the value while OR-ing the dropped low bit back in,
	      // convert as signed, then double the result.
	      SDValue ShiftConst = DAG.getConstant(
	          1, dl, TLI.getShiftAmountTy(Op0.getValueType(), DAG.getDataLayout()));
	      SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst);
	      SDValue AndConst = DAG.getConstant(1, dl, MVT::i64);
	      SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst);
	      SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, Shr);

	      SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Or);
	      SDValue Slow = DAG.getNode(ISD::FADD, dl, MVT::f32, SignCvt, SignCvt);

	      // TODO: This really should be implemented using a branch rather than a
	      // select. We happen to get lucky and machinesink does the right
	      // thing most of the time. This would be a good candidate for a
	      // pseudo-op, or, even better, for whole-function isel.
	      SDValue SignBitTest = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
	          Op0, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
	      return DAG.getSelect(dl, MVT::f32, SignBitTest, Slow, Fast);
	    }

	    // Otherwise, implement the fully general conversion.

	    SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
	        DAG.getConstant(UINT64_C(0xfffffffffffff800), dl, MVT::i64));
	    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And,
	                             DAG.getConstant(UINT64_C(0x800), dl, MVT::i64));
	    SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
	                               DAG.getConstant(UINT64_C(0x7ff), dl, MVT::i64));
	    SDValue Ne = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), And2,
	                              DAG.getConstant(UINT64_C(0), dl, MVT::i64),
	                              ISD::SETNE);
	    SDValue Sel = DAG.getSelect(dl, MVT::i64, Ne, Or, Op0);
	    SDValue Ge = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), Op0,
	                              DAG.getConstant(UINT64_C(0x0020000000000000), dl,
	                                              MVT::i64),
	                              ISD::SETUGE);
	    SDValue Sel2 = DAG.getSelect(dl, MVT::i64, Ge, Sel, Op0);
	    EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType(), DAG.getDataLayout());

	    // Convert the two 32-bit halves separately and recombine them in f64.
	    SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
	                             DAG.getConstant(32, dl, SHVT));
	    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sh);
	    SDValue Fcvt = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Trunc);
	    SDValue TwoP32 =
	        DAG.getConstantFP(BitsToDouble(UINT64_C(0x41f0000000000000)), dl,
	                          MVT::f64);
	    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, MVT::f64, TwoP32, Fcvt);
	    SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sel2);
	    SDValue Fcvt2 = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Lo);
	    SDValue Fadd = DAG.getNode(ISD::FADD, dl, MVT::f64, Fmul, Fcvt2);
	    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Fadd,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  // Generic unsigned path: convert as if signed, then correct below.
	  SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);

	  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(Op0.getValueType()),
	                                 Op0,
	                                 DAG.getConstant(0, dl, Op0.getValueType()),
	                                 ISD::SETLT);
	  // Byte offset into the 8-byte constant below: 4 when the sign bit is set,
	  // 0 otherwise.
	  SDValue Zero = DAG.getIntPtrConstant(0, dl),
	          Four = DAG.getIntPtrConstant(4, dl);
	  SDValue CstOffset = DAG.getSelect(dl, Zero.getValueType(),
	                                    SignSet, Four, Zero);

	  // If the sign bit of the integer is set, the large number will be treated
	  // as a negative number. To counteract this, the dynamic code adds an
	  // offset depending on the data type.
	  uint64_t FF;
	  switch (Op0.getSimpleValueType().SimpleTy) {
	  default: llvm_unreachable("Unsupported integer type!");
	  case MVT::i8 : FF = 0x43800000ULL; break;  // 2^8 (as a float)
	  case MVT::i16: FF = 0x47800000ULL; break;  // 2^16 (as a float)
	  case MVT::i32: FF = 0x4F800000ULL; break;  // 2^32 (as a float)
	  case MVT::i64: FF = 0x5F800000ULL; break;  // 2^64 (as a float)
	  }
	  // Move the float pattern to the upper 32 bits on little-endian targets; the
	  // other half of the 64-bit constant stays zero.
	  if (DAG.getDataLayout().isLittleEndian())
	    FF <<= 32;
	  Constant *FudgeFactor = ConstantInt::get(
	                                       Type::getInt64Ty(*DAG.getContext()), FF);

	  SDValue CPIdx =
	      DAG.getConstantPool(FudgeFactor, TLI.getPointerTy(DAG.getDataLayout()));
	  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
	  CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
	  // The load address may be offset by 4, so cap the alignment at 4.
	  Alignment = std::min(Alignment, 4u);
	  SDValue FudgeInReg;
	  if (DestVT == MVT::f32)
	    FudgeInReg = DAG.getLoad(
	        MVT::f32, dl, DAG.getEntryNode(), CPIdx,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	        Alignment);
	  else {
	    SDValue Load = DAG.getExtLoad(
	        ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
	        Alignment);
	    // Legalize the extending load right away; the handle is used to fetch
	    // the value after LegalizeOp has run.
	    HandleSDNode Handle(Load);
	    LegalizeOp(Load.getNode());
	    FudgeInReg = Handle.getValue();
	  }

	  return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
	}

	/// Legalize a *INT_TO_FP operation by promotion: widen \p LegalOp to the
	/// first larger integer type for which the target has a legal or custom
	/// conversion, then convert that wider value to \p DestVT.
	///
	/// SINT_TO_FP is tried first at every candidate type; UINT_TO_FP is only
	/// considered when \p isSigned is false. The input is sign-extended when
	/// \p isSigned is true and zero-extended otherwise.
	SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT,
	                                                    bool isSigned,
	                                                    const SDLoc &dl) {
	  EVT WiderVT = LegalOp.getValueType();
	  unsigned ConvOpc = 0;

	  // Step through the successive simple value types until one works.
	  for (;;) {
	    WiderVT = (MVT::SimpleValueType)(WiderVT.getSimpleVT().SimpleTy+1);
	    assert(WiderVT.isInteger() && "Ran out of possibilities!");

	    // Prefer a signed conversion of the wider type.
	    if (TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, WiderVT)) {
	      ConvOpc = ISD::SINT_TO_FP;
	      break;
	    }

	    // An unsigned conversion is acceptable only for unsigned requests.
	    if (!isSigned && TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, WiderVT)) {
	      ConvOpc = ISD::UINT_TO_FP;
	      break;
	    }
	  }

	  // Extend the input to the chosen type, then run the conversion on it.
	  const unsigned ExtOpc = isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	  SDValue Widened = DAG.getNode(ExtOpc, dl, WiderVT, LegalOp);
	  return DAG.getNode(ConvOpc, dl, DestVT, Widened);
	}

	/// Legalize a FP_TO_*INT operation by promotion: perform the conversion into
	/// the first integer type larger than \p DestVT for which the target has a
	/// legal or custom conversion, then truncate the result back to \p DestVT.
	///
	/// FP_TO_SINT is tried first at every candidate type; FP_TO_UINT is only
	/// considered when \p isSigned is false.
	SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT,
	                                                    bool isSigned,
	                                                    const SDLoc &dl) {
	  EVT WiderVT = DestVT;
	  unsigned ConvOpc = 0;

	  // Step through the successive simple value types until one works.
	  for (;;) {
	    WiderVT = (MVT::SimpleValueType)(WiderVT.getSimpleVT().SimpleTy+1);
	    assert(WiderVT.isInteger() && "Ran out of possibilities!");

	    // A larger signed type can hold all unsigned values of the requested
	    // type, so FP_TO_SINT is valid for both signed and unsigned requests.
	    if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, WiderVT)) {
	      ConvOpc = ISD::FP_TO_SINT;
	      break;
	    }

	    // FP_TO_UINT is only an option for unsigned requests.
	    if (!isSigned && TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, WiderVT)) {
	      ConvOpc = ISD::FP_TO_UINT;
	      break;
	    }
	  }

	  // Convert into the wider type, then truncate to the requested size.
	  SDValue Wide = DAG.getNode(ConvOpc, dl, WiderVT, LegalOp);
	  return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Wide);
	}

	/// Legalize a BITREVERSE scalar/vector operation as a series of mask + shifts.
	///
	/// \param Op the value whose bits are to be reversed.
	/// \param dl the debug location attached to every node created here.
	/// \returns a value of the same type as \p Op holding the reversed bits.
	///
	/// When the scalar width is a power of two of at least 8 bits, the bytes are
	/// swapped with BSWAP and then the nibbles, bit pairs and single bits are
	/// swapped within each byte. Any other width is handled one bit at a time.
	SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
	  EVT VT = Op.getValueType();
	  EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	  unsigned Sz = VT.getScalarSizeInBits();

	  SDValue Tmp, Tmp2;

	  // If we can, perform BSWAP first and then the mask+swap the i4, then i2
	  // and finally the i1 pairs.
	  // TODO: We can easily support i4/i2 legal types if any target ever does.
	  if (Sz >= 8 && isPowerOf2_32(Sz)) {
	    // Swap adjacent groups of Amt bits:
	    //   ((V & HiMask) >> Amt) | ((V & LoMask) << Amt)
	    // The masks are one-byte patterns splatted across the full width with
	    // APInt::getSplat, which stays well-defined for widths above 64 bits
	    // (shifting a 64-bit literal by J >= 64 would not). The shift amounts are
	    // built in SHVT, the target's shift-amount type, to match the per-bit
	    // fallback below.
	    auto SwapBitGroups = [&](SDValue V, unsigned HiByte, unsigned LoByte,
	                             unsigned Amt) -> SDValue {
	      APInt HiMask = APInt::getSplat(Sz, APInt(8, HiByte));
	      APInt LoMask = APInt::getSplat(Sz, APInt(8, LoByte));
	      SDValue Hi =
	          DAG.getNode(ISD::AND, dl, VT, V, DAG.getConstant(HiMask, dl, VT));
	      SDValue Lo =
	          DAG.getNode(ISD::AND, dl, VT, V, DAG.getConstant(LoMask, dl, VT));
	      Hi = DAG.getNode(ISD::SRL, dl, VT, Hi, DAG.getConstant(Amt, dl, SHVT));
	      Lo = DAG.getNode(ISD::SHL, dl, VT, Lo, DAG.getConstant(Amt, dl, SHVT));
	      return DAG.getNode(ISD::OR, dl, VT, Hi, Lo);
	    };

	    // BSWAP if the type is wider than a single byte.
	    Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);

	    // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
	    Tmp = SwapBitGroups(Tmp, 0xF0, 0x0F, 4);

	    // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
	    Tmp = SwapBitGroups(Tmp, 0xCC, 0x33, 2);

	    // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
	    Tmp = SwapBitGroups(Tmp, 0xAA, 0x55, 1);
	    return Tmp;
	  }

	  // Generic fallback: move every source bit I to destination bit J = Sz-1-I
	  // and OR the isolated bits together.
	  Tmp = DAG.getConstant(0, dl, VT);
	  for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) {
	    if (I < J)
	      Tmp2 =
	          DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT));
	    else
	      Tmp2 =
	          DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT));

	    // Keep only destination bit J of the shifted value.
	    APInt Shift(Sz, 1);
	    Shift <<= J;
	    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT));
	    Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2);
	  }

	  return Tmp;
	}

	/// Open code the operations for BSWAP of the specified operation.
	///
	/// Each byte of \p Op is shifted to its mirrored position, the interior bytes
	/// are isolated with an AND mask, and the pieces are ORed back together.
	/// Only i16, i32 and i64 scalar element types are handled.
	SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
	  EVT VT = Op.getValueType();
	  EVT ShAmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());

	  // Small builders for the node kinds used below.
	  auto Shl = [&](unsigned Amt) -> SDValue {
	    return DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(Amt, dl, ShAmtVT));
	  };
	  auto Srl = [&](unsigned Amt) -> SDValue {
	    return DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(Amt, dl, ShAmtVT));
	  };
	  auto Keep = [&](SDValue V, uint64_t Mask) -> SDValue {
	    return DAG.getNode(ISD::AND, dl, VT, V, DAG.getConstant(Mask, dl, VT));
	  };
	  auto Or = [&](SDValue A, SDValue B) -> SDValue {
	    return DAG.getNode(ISD::OR, dl, VT, A, B);
	  };

	  switch (VT.getSimpleVT().getScalarType().SimpleTy) {
	  case MVT::i16: {
	    // Two bytes: the shifts alone discard the unwanted bits.
	    SDValue Up = Shl(8);
	    SDValue Down = Srl(8);
	    return Or(Down, Up);
	  }
	  case MVT::i32: {
	    // ByteN is the piece that ends up in byte position N of the result.
	    SDValue Byte3 = Shl(24);
	    SDValue Byte2 = Shl(8);
	    SDValue Byte1 = Srl(8);
	    SDValue Byte0 = Srl(24);
	    Byte2 = Keep(Byte2, 0xFF0000);
	    Byte1 = Keep(Byte1, 0xFF00);
	    SDValue Upper = Or(Byte3, Byte2);
	    SDValue Lower = Or(Byte1, Byte0);
	    return Or(Upper, Lower);
	  }
	  case MVT::i64: {
	    // ByteN is the piece that ends up in byte position N of the result.
	    SDValue Byte7 = Shl(56);
	    SDValue Byte6 = Shl(40);
	    SDValue Byte5 = Shl(24);
	    SDValue Byte4 = Shl(8);
	    SDValue Byte3 = Srl(8);
	    SDValue Byte2 = Srl(24);
	    SDValue Byte1 = Srl(40);
	    SDValue Byte0 = Srl(56);
	    // The outermost bytes need no mask; the shift already cleared the rest.
	    Byte6 = Keep(Byte6, 255ULL << 48);
	    Byte5 = Keep(Byte5, 255ULL << 40);
	    Byte4 = Keep(Byte4, 255ULL << 32);
	    Byte3 = Keep(Byte3, 255ULL << 24);
	    Byte2 = Keep(Byte2, 255ULL << 16);
	    Byte1 = Keep(Byte1, 255ULL << 8);
	    // Combine pairwise.
	    SDValue Pair76 = Or(Byte7, Byte6);
	    SDValue Pair54 = Or(Byte5, Byte4);
	    SDValue Pair32 = Or(Byte3, Byte2);
	    SDValue Pair10 = Or(Byte1, Byte0);
	    SDValue UpperHalf = Or(Pair76, Pair54);
	    SDValue LowerHalf = Or(Pair32, Pair10);
	    return Or(UpperHalf, LowerHalf);
	  }
	  default:
	    llvm_unreachable("Unhandled Expand type in BSWAP!");
	  }
	}

	/// Expand the specified bitcount instruction into operations.
	///
	/// \c Opc is one of CTPOP, CTLZ, CTLZ_ZERO_UNDEF, CTTZ or CTTZ_ZERO_UNDEF and
	/// \c Op is the value being counted. The returned value has the same type as
	/// \c Op. Any other opcode is unreachable.
	SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
	                                             const SDLoc &dl) {
	  // Shared by CTLZ and CTTZ: compute the *_ZERO_UNDEF form and select the bit
	  // width instead when the input is zero.
	  auto SelectWidthIfZero = [&](unsigned ZeroUndefOpc) -> SDValue {
	    EVT VT = Op.getValueType();
	    EVT CCVT = getSetCCResultType(VT);
	    SDValue Count = DAG.getNode(ZeroUndefOpc, dl, VT, Op);
	    SDValue Zero = DAG.getConstant(0, dl, VT);
	    SDValue IsZero = DAG.getSetCC(dl, CCVT, Op, Zero, ISD::SETEQ);
	    SDValue Width = DAG.getConstant(VT.getSizeInBits(), dl, VT);
	    return DAG.getNode(ISD::SELECT, dl, VT, IsZero, Width, Count);
	  };

	  switch (Opc) {
	  default: llvm_unreachable("Cannot expand this yet!");
	  case ISD::CTPOP: {
	    EVT VT = Op.getValueType();
	    EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	    unsigned Len = VT.getSizeInBits();

	    assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
	           "CTPOP not implemented for this type.");

	    // This is the "best" algorithm from
	    // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel

	    // A constant of type VT with the given byte repeated across Len bits.
	    auto ByteSplat = [&](unsigned Byte) -> SDValue {
	      return DAG.getConstant(APInt::getSplat(Len, APInt(8, Byte)), dl, VT);
	    };
	    // Logical right shift of V by a constant amount.
	    auto Lsr = [&](SDValue V, unsigned Amt) -> SDValue {
	      return DAG.getNode(ISD::SRL, dl, VT, V, DAG.getConstant(Amt, dl, ShVT));
	    };

	    SDValue Mask55 = ByteSplat(0x55);
	    SDValue Mask33 = ByteSplat(0x33);
	    SDValue Mask0F = ByteSplat(0x0F);
	    SDValue Mask01 = ByteSplat(0x01);

	    // v = v - ((v >> 1) & 0x55555555...)
	    SDValue OddBits = DAG.getNode(ISD::AND, dl, VT, Lsr(Op, 1), Mask55);
	    Op = DAG.getNode(ISD::SUB, dl, VT, Op, OddBits);

	    // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
	    SDValue LowPairs = DAG.getNode(ISD::AND, dl, VT, Op, Mask33);
	    SDValue HighPairs = DAG.getNode(ISD::AND, dl, VT, Lsr(Op, 2), Mask33);
	    Op = DAG.getNode(ISD::ADD, dl, VT, LowPairs, HighPairs);

	    // v = (v + (v >> 4)) & 0x0F0F0F0F...
	    SDValue NibbleSum = DAG.getNode(ISD::ADD, dl, VT, Op, Lsr(Op, 4));
	    Op = DAG.getNode(ISD::AND, dl, VT, NibbleSum, Mask0F);

	    // v = (v * 0x01010101...) >> (Len - 8)
	    SDValue ByteSum = DAG.getNode(ISD::MUL, dl, VT, Op, Mask01);
	    Op = Lsr(ByteSum, Len - 8);

	    return Op;
	  }
	  case ISD::CTLZ_ZERO_UNDEF:
	    // This trivially expands to CTLZ.
	    return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op);
	  case ISD::CTLZ: {
	    EVT VT = Op.getValueType();
	    unsigned Len = VT.getSizeInBits();

	    if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT))
	      return SelectWidthIfZero(ISD::CTLZ_ZERO_UNDEF);

	    // for now, we do this:
	    // x = x | (x >> 1);
	    // x = x | (x >> 2);
	    // ...
	    // x = x | (x >>16);
	    // x = x | (x >>32); // for 64-bit input
	    // return popcount(~x);
	    //
	    // Ref: "Hacker's Delight" by Henry Warren
	    EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	    for (unsigned Amt = 1; Amt <= Len / 2; Amt <<= 1) {
	      SDValue AmtCst = DAG.getConstant(Amt, dl, ShVT);
	      SDValue Shifted = DAG.getNode(ISD::SRL, dl, VT, Op, AmtCst);
	      Op = DAG.getNode(ISD::OR, dl, VT, Op, Shifted);
	    }
	    Op = DAG.getNOT(dl, Op, VT);
	    return DAG.getNode(ISD::CTPOP, dl, VT, Op);
	  }
	  case ISD::CTTZ_ZERO_UNDEF:
	    // This trivially expands to CTTZ.
	    return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op);
	  case ISD::CTTZ: {
	    EVT VT = Op.getValueType();
	    unsigned Len = VT.getSizeInBits();

	    if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT))
	      return SelectWidthIfZero(ISD::CTTZ_ZERO_UNDEF);

	    // for now, we use: { return popcount(~x & (x - 1)); }
	    // unless the target has ctlz but not ctpop, in which case we use:
	    // { return 32 - nlz(~x & (x-1)); }
	    // Ref: "Hacker's Delight" by Henry Warren
	    SDValue NotOp = DAG.getNOT(dl, Op, VT);
	    SDValue OpMinusOne =
	        DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT));
	    SDValue TrailingOnes = DAG.getNode(ISD::AND, dl, VT, NotOp, OpMinusOne);

	    // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
	    const bool HasCTPOP = TLI.isOperationLegalOrCustom(ISD::CTPOP, VT);
	    if (!HasCTPOP && TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) {
	      SDValue Width = DAG.getConstant(Len, dl, VT);
	      SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, VT, TrailingOnes);
	      return DAG.getNode(ISD::SUB, dl, VT, Width, LeadingZeros);
	    }
	    return DAG.getNode(ISD::CTPOP, dl, VT, TrailingOnes);
	  }
	  }
	}

	bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
	DEBUG(dbgs() << "Trying to expand node\n");
	SmallVector<SDValue, 8> Results;
	SDLoc dl(Node);
	SDValue Tmp1, Tmp2, Tmp3, Tmp4;
	bool NeedInvert;
	switch (Node->getOpcode()) {
	case ISD::CTPOP:
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF:
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF:
	Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
	Results.push_back(Tmp1);
	break;
	case ISD::BITREVERSE:
	Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl));
	break;
	case ISD::BSWAP:
	Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
	break;
	case ISD::FRAMEADDR:
	case ISD::RETURNADDR:
	case ISD::FRAME_TO_ARGS_OFFSET:
	Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
	break;
	case ISD::EH_DWARF_CFA: {
	SDValue CfaArg = DAG.getSExtOrTrunc(Node->getOperand(0), dl,
	TLI.getPointerTy(DAG.getDataLayout()));
	SDValue Offset = DAG.getNode(ISD::ADD, dl,
	CfaArg.getValueType(),
	DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, dl,
	CfaArg.getValueType()),
	CfaArg);
	SDValue FA = DAG.getNode(
	ISD::FRAMEADDR, dl, TLI.getPointerTy(DAG.getDataLayout()),
	DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout())));
	Results.push_back(DAG.getNode(ISD::ADD, dl, FA.getValueType(),
	FA, Offset));
	break;
	}
	case ISD::FLT_ROUNDS_:
	Results.push_back(DAG.getConstant(1, dl, Node->getValueType(0)));
	break;
	case ISD::EH_RETURN:
	case ISD::EH_LABEL:
	case ISD::PREFETCH:
	case ISD::VAEND:
	case ISD::EH_SJLJ_LONGJMP:
	// If the target didn't expand these, there's nothing to do, so just
	// preserve the chain and be done.
	Results.push_back(Node->getOperand(0));
	break;
	case ISD::READCYCLECOUNTER:
	// If the target didn't expand this, just return 'zero' and preserve the
	// chain.
	Results.append(Node->getNumValues() - 1,
	DAG.getConstant(0, dl, Node->getValueType(0)));
	Results.push_back(Node->getOperand(0));
	break;
	case ISD::EH_SJLJ_SETJMP:
	// If the target didn't expand this, just return 'zero' and preserve the
	// chain.
	Results.push_back(DAG.getConstant(0, dl, MVT::i32));
	Results.push_back(Node->getOperand(0));
	break;
	case ISD::ATOMIC_LOAD: {
	// There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP.
	SDValue Zero = DAG.getConstant(0, dl, Node->getValueType(0));
	SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
	SDValue Swap = DAG.getAtomicCmpSwap(
	ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
	Node->getOperand(0), Node->getOperand(1), Zero, Zero,
	cast<AtomicSDNode>(Node)->getMemOperand());
	Results.push_back(Swap.getValue(0));
	Results.push_back(Swap.getValue(1));
	break;
	}
	case ISD::ATOMIC_STORE: {
	// There is no libcall for atomic store; fake it with ATOMIC_SWAP.
	SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
	cast<AtomicSDNode>(Node)->getMemoryVT(),
	Node->getOperand(0),
	Node->getOperand(1), Node->getOperand(2),
	cast<AtomicSDNode>(Node)->getMemOperand());
	Results.push_back(Swap.getValue(1));
	break;
	}
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	// Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and
	// splits out the success value as a comparison. Expanding the resulting
	// ATOMIC_CMP_SWAP will produce a libcall.
	SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
	SDValue Res = DAG.getAtomicCmpSwap(
	ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
	Node->getOperand(0), Node->getOperand(1), Node->getOperand(2),
	Node->getOperand(3), cast<MemSDNode>(Node)->getMemOperand());

	SDValue ExtRes = Res;
	SDValue LHS = Res;
	SDValue RHS = Node->getOperand(1);

	EVT AtomicType = cast<AtomicSDNode>(Node)->getMemoryVT();
	EVT OuterType = Node->getValueType(0);
	switch (TLI.getExtendForAtomicOps()) {
	case ISD::SIGN_EXTEND:
	LHS = DAG.getNode(ISD::AssertSext, dl, OuterType, Res,
	DAG.getValueType(AtomicType));
	RHS = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OuterType,
	Node->getOperand(2), DAG.getValueType(AtomicType));
	ExtRes = LHS;
	break;
	case ISD::ZERO_EXTEND:
	LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res,
	DAG.getValueType(AtomicType));
	RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
	ExtRes = LHS;
	break;
	case ISD::ANY_EXTEND:
	LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType);
	RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
	break;
	default:
	llvm_unreachable("Invalid atomic op extension");
	}

	SDValue Success =
	DAG.getSetCC(dl, Node->getValueType(1), LHS, RHS, ISD::SETEQ);

	Results.push_back(ExtRes.getValue(0));
	Results.push_back(Success);
	Results.push_back(Res.getValue(1));
	break;
	}
	case ISD::DYNAMIC_STACKALLOC:
	ExpandDYNAMIC_STACKALLOC(Node, Results);
	break;
	case ISD::MERGE_VALUES:
	for (unsigned i = 0; i < Node->getNumValues(); i++)
	Results.push_back(Node->getOperand(i));
	break;
	case ISD::UNDEF: {
	EVT VT = Node->getValueType(0);
	if (VT.isInteger())
	Results.push_back(DAG.getConstant(0, dl, VT));
	else {
	assert(VT.isFloatingPoint() && "Unknown value type!");
	Results.push_back(DAG.getConstantFP(0, dl, VT));
	}
	break;
	}
	case ISD::FP_ROUND:
	case ISD::BITCAST:
	Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
	Node->getValueType(0), dl);
	Results.push_back(Tmp1);
	break;
	case ISD::FP_EXTEND:
	Tmp1 = EmitStackConvert(Node->getOperand(0),
	Node->getOperand(0).getValueType(),
	Node->getValueType(0), dl);
	Results.push_back(Tmp1);
	break;
	case ISD::SIGN_EXTEND_INREG: {
	EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
	EVT VT = Node->getValueType(0);

	// An in-register sign-extend of a boolean is a negation:
	// 'true' (1) sign-extended is -1.
	// 'false' (0) sign-extended is 0.
	// However, we must mask the high bits of the source operand because the
	// SIGN_EXTEND_INREG does not guarantee that the high bits are already zero.

	// TODO: Do this for vectors too?
	if (ExtraVT.getSizeInBits() == 1) {
	SDValue One = DAG.getConstant(1, dl, VT);
	SDValue And = DAG.getNode(ISD::AND, dl, VT, Node->getOperand(0), One);
	SDValue Zero = DAG.getConstant(0, dl, VT);
	SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, Zero, And);
	Results.push_back(Neg);
	break;
	}

	// NOTE: we could fall back on load/store here too for targets without
	// SRA. However, it is doubtful that any exist.
	EVT ShiftAmountTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	unsigned BitsDiff = VT.getScalarSizeInBits() -
	ExtraVT.getScalarSizeInBits();
	SDValue ShiftCst = DAG.getConstant(BitsDiff, dl, ShiftAmountTy);
	Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
	Node->getOperand(0), ShiftCst);
	Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::FP_ROUND_INREG: {
	// The only way we can lower this is to turn it into a TRUNCSTORE,
	// EXTLOAD pair, targeting a temporary location (a stack slot).

	// NOTE: there is a choice here between constantly creating new stack
	// slots and always reusing the same one. We currently always create
	// new ones, as reuse may inhibit scheduling.
	EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
	Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT,
	Node->getValueType(0), dl);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
	Node->getOperand(0), Node->getValueType(0), dl);
	Results.push_back(Tmp1);
	break;
	case ISD::FP_TO_SINT:
	if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
	Results.push_back(Tmp1);
	break;
	case ISD::FP_TO_UINT: {
	SDValue True, False;
	EVT VT = Node->getOperand(0).getValueType();
	EVT NVT = Node->getValueType(0);
	APFloat apf(DAG.EVTToAPFloatSemantics(VT),
	APInt::getNullValue(VT.getSizeInBits()));
	APInt x = APInt::getSignMask(NVT.getSizeInBits());
	(void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
	Tmp1 = DAG.getConstantFP(apf, dl, VT);
	Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
	Node->getOperand(0),
	Tmp1, ISD::SETLT);
	True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
	// TODO: Should any fast-math-flags be set for the FSUB?
	False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
	DAG.getNode(ISD::FSUB, dl, VT,
	Node->getOperand(0), Tmp1));
	False = DAG.getNode(ISD::XOR, dl, NVT, False,
	DAG.getConstant(x, dl, NVT));
	Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::VAARG:
	Results.push_back(DAG.expandVAArg(Node));
	Results.push_back(Results[0].getValue(1));
	break;
	case ISD::VACOPY:
	Results.push_back(DAG.expandVACopy(Node));
	break;
	case ISD::EXTRACT_VECTOR_ELT:
	if (Node->getOperand(0).getValueType().getVectorNumElements() == 1)
	// This must be an access of the only element. Return it.
	Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0),
	Node->getOperand(0));
	else
	Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0));
	Results.push_back(Tmp1);
	break;
	case ISD::EXTRACT_SUBVECTOR:
	Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0)));
	break;
	case ISD::INSERT_SUBVECTOR:
	Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
	break;
	case ISD::CONCAT_VECTORS:
	Results.push_back(ExpandVectorBuildThroughStack(Node));
	break;
	case ISD::SCALAR_TO_VECTOR:
	Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
	break;
	case ISD::INSERT_VECTOR_ELT:
	Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0),
	Node->getOperand(1),
	Node->getOperand(2), dl));
	break;
	case ISD::VECTOR_SHUFFLE: {
	SmallVector<int, 32> NewMask;
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();

	EVT VT = Node->getValueType(0);
	EVT EltVT = VT.getVectorElementType();
	SDValue Op0 = Node->getOperand(0);
	SDValue Op1 = Node->getOperand(1);
	if (!TLI.isTypeLegal(EltVT)) {
	EVT NewEltVT = TLI.getTypeToTransformTo(*DAG.getContext(), EltVT);

	// BUILD_VECTOR operands are allowed to be wider than the element type.
	// But if NewEltVT is smaller that EltVT the BUILD_VECTOR does not accept
	// it.
	if (NewEltVT.bitsLT(EltVT)) {
	// Convert shuffle node.
	// If original node was v4i64 and the new EltVT is i32,
	// cast operands to v8i32 and re-build the mask.

	// Calculate new VT, the size of the new VT should be equal to original.
	EVT NewVT =
	EVT::getVectorVT(*DAG.getContext(), NewEltVT,
	VT.getSizeInBits() / NewEltVT.getSizeInBits());
	assert(NewVT.bitsEq(VT));

	// cast operands to new VT
	Op0 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op0);
	Op1 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op1);

	// Convert the shuffle mask
	unsigned int factor =
	NewVT.getVectorNumElements()/VT.getVectorNumElements();

	// EltVT gets smaller
	assert(factor > 0);

	for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
	if (Mask[i] < 0) {
	for (unsigned fi = 0; fi < factor; ++fi)
	NewMask.push_back(Mask[i]);
	}
	else {
	for (unsigned fi = 0; fi < factor; ++fi)
	NewMask.push_back(Mask[i]*factor+fi);
	}
	}
	Mask = NewMask;
	VT = NewVT;
	}
	EltVT = NewEltVT;
	}
	unsigned NumElems = VT.getVectorNumElements();
	SmallVector<SDValue, 16> Ops;
	for (unsigned i = 0; i != NumElems; ++i) {
	if (Mask[i] < 0) {
	Ops.push_back(DAG.getUNDEF(EltVT));
	continue;
	}
	unsigned Idx = Mask[i];
	if (Idx < NumElems)
	Ops.push_back(DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
	DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
	else
	Ops.push_back(DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op1,
	DAG.getConstant(Idx - NumElems, dl,
	TLI.getVectorIdxTy(DAG.getDataLayout()))));
	}

	Tmp1 = DAG.getBuildVector(VT, dl, Ops);
	// We may have changed the BUILD_VECTOR type. Cast it back to the Node type.
	Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::EXTRACT_ELEMENT: {
	EVT OpTy = Node->getOperand(0).getValueType();
	if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
	// 1 -> Hi
	Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
	DAG.getConstant(OpTy.getSizeInBits() / 2, dl,
	TLI.getShiftAmountTy(
	Node->getOperand(0).getValueType(),
	DAG.getDataLayout())));
	Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
	} else {
	// 0 -> Lo
	Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0),
	Node->getOperand(0));
	}
	Results.push_back(Tmp1);
	break;
	}
	case ISD::STACKSAVE:
	// Expand to CopyFromReg if the target set
	// StackPointerRegisterToSaveRestore.
	if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
	Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, SP,
	Node->getValueType(0)));
	Results.push_back(Results[0].getValue(1));
	} else {
	Results.push_back(DAG.getUNDEF(Node->getValueType(0)));
	Results.push_back(Node->getOperand(0));
	}
	break;
	case ISD::STACKRESTORE:
	// Expand to CopyToReg if the target set
	// StackPointerRegisterToSaveRestore.
	if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
	Results.push_back(DAG.getCopyToReg(Node->getOperand(0), dl, SP,
	Node->getOperand(1)));
	} else {
	Results.push_back(Node->getOperand(0));
	}
	break;
	case ISD::GET_DYNAMIC_AREA_OFFSET:
	Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
	Results.push_back(Results[0].getValue(0));
	break;
	case ISD::FCOPYSIGN:
	Results.push_back(ExpandFCOPYSIGN(Node));
	break;
	case ISD::FNEG:
	// Expand Y = FNEG(X) -> Y = SUB -0.0, X
	Tmp1 = DAG.getConstantFP(-0.0, dl, Node->getValueType(0));
	// TODO: If FNEG has fast-math-flags, propagate them to the FSUB.
	Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1,
	Node->getOperand(0));
	Results.push_back(Tmp1);
	break;
	case ISD::FABS:
	Results.push_back(ExpandFABS(Node));
	break;
	case ISD::SMIN:
	case ISD::SMAX:
	case ISD::UMIN:
	case ISD::UMAX: {
	// Expand Y = MAX(A, B) -> Y = (A > B) ? A : B
	ISD::CondCode Pred;
	switch (Node->getOpcode()) {
	default: llvm_unreachable("How did we get here?");
	case ISD::SMAX: Pred = ISD::SETGT; break;
	case ISD::SMIN: Pred = ISD::SETLT; break;
	case ISD::UMAX: Pred = ISD::SETUGT; break;
	case ISD::UMIN: Pred = ISD::SETULT; break;
	}
	Tmp1 = Node->getOperand(0);
	Tmp2 = Node->getOperand(1);
	Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp1, Tmp2, Pred);
	Results.push_back(Tmp1);
	break;
	}

	case ISD::FSIN:
	case ISD::FCOS: {
	EVT VT = Node->getValueType(0);
	// Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin /
	// fcos which share the same operand and both are used.
	if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) \|\|
	isSinCosLibcallAvailable(Node, TLI))
	&& useSinCos(Node)) {
	SDVTList VTs = DAG.getVTList(VT, VT);
	Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0));
	if (Node->getOpcode() == ISD::FCOS)
	Tmp1 = Tmp1.getValue(1);
	Results.push_back(Tmp1);
	}
	break;
	}
	case ISD::FMAD:
	llvm_unreachable("Illegal fmad should never be formed");

	case ISD::FP16_TO_FP:
	if (Node->getValueType(0) != MVT::f32) {
	// We can extend to types bigger than f32 in two steps without changing
	// the result. Since "f16 -> f32" is much more commonly available, give
	// CodeGen the option of emitting that before resorting to a libcall.
	SDValue Res =
	DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0));
	Results.push_back(
	DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res));
	}
	break;
	case ISD::FP_TO_FP16:
	DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
	if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
	SDValue Op = Node->getOperand(0);
	MVT SVT = Op.getSimpleValueType();
	if ((SVT == MVT::f64 \|\| SVT == MVT::f80) &&
	TLI.isOperationLegalOrCustom(ISD::FP_TO_FP16, MVT::f32)) {
	// Under fastmath, we can expand this node into a fround followed by
	// a float-half conversion.
	SDValue FloatVal = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(
	DAG.getNode(ISD::FP_TO_FP16, dl, Node->getValueType(0), FloatVal));
	}
	}
	break;
	case ISD::ConstantFP: {
	ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
	// Check to see if this FP immediate is already legal.
	// If this is a legal constant, turn it into a TargetConstantFP node.
	if (!TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0)))
	Results.push_back(ExpandConstantFP(CFP, true));
	break;
	}
	case ISD::Constant: {
	ConstantSDNode *CP = cast<ConstantSDNode>(Node);
	Results.push_back(ExpandConstant(CP));
	break;
	}
	case ISD::FSUB: {
	EVT VT = Node->getValueType(0);
	if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
	TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) {
	const SDNodeFlags Flags = Node->getFlags();
	Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1));
	Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags);
	Results.push_back(Tmp1);
	}
	break;
	}
	case ISD::SUB: {
	EVT VT = Node->getValueType(0);
	assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
	TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
	"Don't know how to expand this subtraction!");
	Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1),
	DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
	VT));
	Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, dl, VT));
	Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
	break;
	}
	case ISD::UREM:
	case ISD::SREM: {
	EVT VT = Node->getValueType(0);
	bool isSigned = Node->getOpcode() == ISD::SREM;
	unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
	unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
	Tmp2 = Node->getOperand(0);
	Tmp3 = Node->getOperand(1);
	if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
	SDVTList VTs = DAG.getVTList(VT, VT);
	Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
	Results.push_back(Tmp1);
	} else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
	// X % Y -> X-X/Y*Y
	Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3);
	Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3);
	Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1);
	Results.push_back(Tmp1);
	}
	break;
	}
	case ISD::UDIV:
	case ISD::SDIV: {
	bool isSigned = Node->getOpcode() == ISD::SDIV;
	unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
	EVT VT = Node->getValueType(0);
	if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
	SDVTList VTs = DAG.getVTList(VT, VT);
	Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
	Node->getOperand(1));
	Results.push_back(Tmp1);
	}
	break;
	}
	case ISD::MULHU:
	case ISD::MULHS: {
	unsigned ExpandOpcode =
	Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI : ISD::SMUL_LOHI;
	EVT VT = Node->getValueType(0);
	SDVTList VTs = DAG.getVTList(VT, VT);

	Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0),
	Node->getOperand(1));
	Results.push_back(Tmp1.getValue(1));
	break;
	}
	case ISD::UMUL_LOHI:
	case ISD::SMUL_LOHI: {
	SDValue LHS = Node->getOperand(0);
	SDValue RHS = Node->getOperand(1);
	MVT VT = LHS.getSimpleValueType();
	unsigned MULHOpcode =
	Node->getOpcode() == ISD::UMUL_LOHI ? ISD::MULHU : ISD::MULHS;

	if (TLI.isOperationLegalOrCustom(MULHOpcode, VT)) {
	Results.push_back(DAG.getNode(ISD::MUL, dl, VT, LHS, RHS));
	Results.push_back(DAG.getNode(MULHOpcode, dl, VT, LHS, RHS));
	break;
	}

	SmallVector<SDValue, 4> Halves;
	EVT HalfType = EVT(VT).getHalfSizedIntegerVT(*DAG.getContext());
	assert(TLI.isTypeLegal(HalfType));
	if (TLI.expandMUL_LOHI(Node->getOpcode(), VT, Node, LHS, RHS, Halves,
	HalfType, DAG,
	TargetLowering::MulExpansionKind::Always)) {
	for (unsigned i = 0; i < 2; ++i) {
	SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Halves[2 * i]);
	SDValue Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Halves[2 * i + 1]);
	SDValue Shift = DAG.getConstant(
	HalfType.getScalarSizeInBits(), dl,
	TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
	Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
	Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
	}
	break;
	}
	break;
	}
	case ISD::MUL: {
	EVT VT = Node->getValueType(0);
	SDVTList VTs = DAG.getVTList(VT, VT);
	// See if multiply or divide can be lowered using two-result operations.
	// We just need the low half of the multiply; try both the signed
	// and unsigned forms. If the target supports both SMUL_LOHI and
	// UMUL_LOHI, form a preference by checking which forms of plain
	// MULH it supports.
	bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, VT);
	bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, VT);
	bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, VT);
	bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, VT);
	unsigned OpToUse = 0;
	if (HasSMUL_LOHI && !HasMULHS) {
	OpToUse = ISD::SMUL_LOHI;
	} else if (HasUMUL_LOHI && !HasMULHU) {
	OpToUse = ISD::UMUL_LOHI;
	} else if (HasSMUL_LOHI) {
	OpToUse = ISD::SMUL_LOHI;
	} else if (HasUMUL_LOHI) {
	OpToUse = ISD::UMUL_LOHI;
	}
	if (OpToUse) {
	Results.push_back(DAG.getNode(OpToUse, dl, VTs, Node->getOperand(0),
	Node->getOperand(1)));
	break;
	}

	SDValue Lo, Hi;
	EVT HalfType = VT.getHalfSizedIntegerVT(*DAG.getContext());
	if (TLI.isOperationLegalOrCustom(ISD::ZERO_EXTEND, VT) &&
	TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND, VT) &&
	TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
	TLI.isOperationLegalOrCustom(ISD::OR, VT) &&
	TLI.expandMUL(Node, Lo, Hi, HalfType, DAG,
	TargetLowering::MulExpansionKind::OnlyLegalOrCustom)) {
	Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
	Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi);
	SDValue Shift =
	DAG.getConstant(HalfType.getSizeInBits(), dl,
	TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
	Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
	Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
	}
	break;
	}
	case ISD::SADDO:
	case ISD::SSUBO: {
	SDValue LHS = Node->getOperand(0);
	SDValue RHS = Node->getOperand(1);
	SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
	ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
	LHS, RHS);
	Results.push_back(Sum);
	EVT ResultType = Node->getValueType(1);
	EVT OType = getSetCCResultType(Node->getValueType(0));

	SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());

	// LHSSign -> LHS >= 0
	// RHSSign -> RHS >= 0
	// SumSign -> Sum >= 0
	//
	// Add:
	// Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
	// Sub:
	// Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
	SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
	SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
	SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
	Node->getOpcode() == ISD::SADDO ?
	ISD::SETEQ : ISD::SETNE);

	SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
	SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);

	SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
	Results.push_back(DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType));
	break;
	}
	case ISD::UADDO:
	case ISD::USUBO: {
	SDValue LHS = Node->getOperand(0);
	SDValue RHS = Node->getOperand(1);
	SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ?
	ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
	LHS, RHS);
	Results.push_back(Sum);

	EVT ResultType = Node->getValueType(1);
	EVT SetCCType = getSetCCResultType(Node->getValueType(0));
	ISD::CondCode CC
	= Node->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT;
	SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);

	Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType));
	break;
	}
	case ISD::UMULO:
	case ISD::SMULO: {
	EVT VT = Node->getValueType(0);
	EVT WideVT = EVT::getIntegerVT(DAG.getContext(), VT.getSizeInBits() 2);
	SDValue LHS = Node->getOperand(0);
	SDValue RHS = Node->getOperand(1);
	SDValue BottomHalf;
	SDValue TopHalf;
	static const unsigned Ops[2][3] =
	{ { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
	{ ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
	bool isSigned = Node->getOpcode() == ISD::SMULO;
	if (TLI.isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
	BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
	TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
	} else if (TLI.isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
	BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS,
	RHS);
	TopHalf = BottomHalf.getValue(1);
	} else if (TLI.isTypeLegal(WideVT)) {
	LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
	RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
	Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
	BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
	DAG.getIntPtrConstant(0, dl));
	TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
	DAG.getIntPtrConstant(1, dl));
	} else {
	// We can fall back to a libcall with an illegal type for the MUL if we
	// have a libcall big enough.
	// Also, we can fall back to a division in some cases, but that's a big
	// performance hit in the general case.
	RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
	if (WideVT == MVT::i16)
	LC = RTLIB::MUL_I16;
	else if (WideVT == MVT::i32)
	LC = RTLIB::MUL_I32;
	else if (WideVT == MVT::i64)
	LC = RTLIB::MUL_I64;
	else if (WideVT == MVT::i128)
	LC = RTLIB::MUL_I128;
	assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");

	SDValue HiLHS;
	SDValue HiRHS;
	if (isSigned) {
	// The high part is obtained by SRA'ing all but one of the bits of low
	// part.
	unsigned LoSize = VT.getSizeInBits();
	HiLHS =
	DAG.getNode(ISD::SRA, dl, VT, LHS,
	DAG.getConstant(LoSize - 1, dl,
	TLI.getPointerTy(DAG.getDataLayout())));
	HiRHS =
	DAG.getNode(ISD::SRA, dl, VT, RHS,
	DAG.getConstant(LoSize - 1, dl,
	TLI.getPointerTy(DAG.getDataLayout())));
	} else {
	HiLHS = DAG.getConstant(0, dl, VT);
	HiRHS = DAG.getConstant(0, dl, VT);
	}

	// Here we're passing the 2 arguments explicitly as 4 arguments that are
	// pre-lowered to the correct types. This all depends upon WideVT not
	// being a legal type for the architecture and thus has to be split to
	// two arguments.
	SDValue Ret;
	if(DAG.getDataLayout().isLittleEndian()) {
	// Halves of WideVT are packed into registers in different order
	// depending on platform endianness. This is usually handled by
	// the C calling convention, but we can't defer to it in
	// the legalizer.
	SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
	Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
	} else {
	SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
	Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
	}
	assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
	"Ret value is a collection of constituent nodes holding result.");
	BottomHalf = Ret.getOperand(0);
	TopHalf = Ret.getOperand(1);
	}

	if (isSigned) {
	Tmp1 = DAG.getConstant(
	VT.getSizeInBits() - 1, dl,
	TLI.getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout()));
	Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, Tmp1);
	TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf, Tmp1,
	ISD::SETNE);
	} else {
	TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf,
	DAG.getConstant(0, dl, VT), ISD::SETNE);
	}

	// Truncate the result if SetCC returns a larger type than needed.
	EVT RType = Node->getValueType(1);
	if (RType.getSizeInBits() < TopHalf.getValueSizeInBits())
	TopHalf = DAG.getNode(ISD::TRUNCATE, dl, RType, TopHalf);

	assert(RType.getSizeInBits() == TopHalf.getValueSizeInBits() &&
	"Unexpected result type for S/UMULO legalization");

	Results.push_back(BottomHalf);
	Results.push_back(TopHalf);
	break;
	}
	case ISD::BUILD_PAIR: {
	EVT PairTy = Node->getValueType(0);
	Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0));
	Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
	Tmp2 = DAG.getNode(
	ISD::SHL, dl, PairTy, Tmp2,
	DAG.getConstant(PairTy.getSizeInBits() / 2, dl,
	TLI.getShiftAmountTy(PairTy, DAG.getDataLayout())));
	Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
	break;
	}
	case ISD::SELECT:
	Tmp1 = Node->getOperand(0);
	Tmp2 = Node->getOperand(1);
	Tmp3 = Node->getOperand(2);
	if (Tmp1.getOpcode() == ISD::SETCC) {
	Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
	Tmp2, Tmp3,
	cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
	} else {
	Tmp1 = DAG.getSelectCC(dl, Tmp1,
	DAG.getConstant(0, dl, Tmp1.getValueType()),
	Tmp2, Tmp3, ISD::SETNE);
	}
	Results.push_back(Tmp1);
	break;
	case ISD::BR_JT: {
	SDValue Chain = Node->getOperand(0);
	SDValue Table = Node->getOperand(1);
	SDValue Index = Node->getOperand(2);

	const DataLayout &TD = DAG.getDataLayout();
	EVT PTy = TLI.getPointerTy(TD);

	unsigned EntrySize =
	DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);

	Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
	DAG.getConstant(EntrySize, dl, Index.getValueType()));
	SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(),
	Index, Table);

	EVT MemVT = EVT::getIntegerVT(DAG.getContext(), EntrySize 8);
	SDValue LD = DAG.getExtLoad(
	ISD::SEXTLOAD, dl, PTy, Chain, Addr,
	MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT);
	Addr = LD;
	if (TLI.isJumpTableRelative()) {
	// For PIC, the sequence is:
	// BRIND(load(Jumptable + index) + RelocBase)
	// RelocBase can be JumpTable, GOT or some sort of global base.
	Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr,
	TLI.getPICJumpTableRelocBase(Table, DAG));
	}
	Tmp1 = DAG.getNode(ISD::BRIND, dl, MVT::Other, LD.getValue(1), Addr);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::BRCOND:
	// Expand brcond's setcc into its constituent parts and create a BR_CC
	// Node.
	Tmp1 = Node->getOperand(0);
	Tmp2 = Node->getOperand(1);
	if (Tmp2.getOpcode() == ISD::SETCC) {
	Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other,
	Tmp1, Tmp2.getOperand(2),
	Tmp2.getOperand(0), Tmp2.getOperand(1),
	Node->getOperand(2));
	} else {
	// We test only the i1 bit. Skip the AND if UNDEF or another AND.
	if (Tmp2.isUndef() \|\|
	(Tmp2.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(Tmp2.getOperand(1)) &&
	dyn_cast<ConstantSDNode>(Tmp2.getOperand(1))->getZExtValue() == 1))
	Tmp3 = Tmp2;
	else
	Tmp3 = DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
	DAG.getConstant(1, dl, Tmp2.getValueType()));
	Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
	DAG.getCondCode(ISD::SETNE), Tmp3,
	DAG.getConstant(0, dl, Tmp3.getValueType()),
	Node->getOperand(2));
	}
	Results.push_back(Tmp1);
	break;
	case ISD::SETCC: {
	Tmp1 = Node->getOperand(0);
	Tmp2 = Node->getOperand(1);
	Tmp3 = Node->getOperand(2);
	bool Legalized = LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2,
	Tmp3, NeedInvert, dl);

	if (Legalized) {
	// If we expanded the SETCC by swapping LHS and RHS, or by inverting the
	// condition code, create a new SETCC node.
	if (Tmp3.getNode())
	Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
	Tmp1, Tmp2, Tmp3);

	// If we expanded the SETCC by inverting the condition code, then wrap
	// the existing SETCC in a NOT to restore the intended condition.
	if (NeedInvert)
	Tmp1 = DAG.getLogicalNOT(dl, Tmp1, Tmp1->getValueType(0));

	Results.push_back(Tmp1);
	break;
	}

	// Otherwise, SETCC for the given comparison type must be completely
	// illegal; expand it into a SELECT_CC.
	EVT VT = Node->getValueType(0);
	int TrueValue;
	switch (TLI.getBooleanContents(Tmp1->getValueType(0))) {
	case TargetLowering::ZeroOrOneBooleanContent:
	case TargetLowering::UndefinedBooleanContent:
	TrueValue = 1;
	break;
	case TargetLowering::ZeroOrNegativeOneBooleanContent:
	TrueValue = -1;
	break;
	}
	Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
	DAG.getConstant(TrueValue, dl, VT),
	DAG.getConstant(0, dl, VT),
	Tmp3);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::SELECT_CC: {
	Tmp1 = Node->getOperand(0); // LHS
	Tmp2 = Node->getOperand(1); // RHS
	Tmp3 = Node->getOperand(2); // True
	Tmp4 = Node->getOperand(3); // False
	EVT VT = Node->getValueType(0);
	SDValue CC = Node->getOperand(4);
	ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();

	if (TLI.isCondCodeLegal(CCOp, Tmp1.getSimpleValueType())) {
	// If the condition code is legal, then we need to expand this
	// node using SETCC and SELECT.
	EVT CmpVT = Tmp1.getValueType();
	assert(!TLI.isOperationExpand(ISD::SELECT, VT) &&
	"Cannot expand ISD::SELECT_CC when ISD::SELECT also needs to be "
	"expanded.");
	EVT CCVT =
	TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
	SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC);
	Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4));
	break;
	}

	// SELECT_CC is legal, so the condition code must not be.
	bool Legalized = false;
	// Try to legalize by inverting the condition. This is for targets that
	// might support an ordered version of a condition, but not the unordered
	// version (or vice versa).
	ISD::CondCode InvCC = ISD::getSetCCInverse(CCOp,
	Tmp1.getValueType().isInteger());
	if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) {
	// Use the new condition code and swap true and false
	Legalized = true;
	Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
	} else {
	// If The inverse is not legal, then try to swap the arguments using
	// the inverse condition code.
	ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC);
	if (TLI.isCondCodeLegal(SwapInvCC, Tmp1.getSimpleValueType())) {
	// The swapped inverse condition is legal, so swap true and false,
	// lhs and rhs.
	Legalized = true;
	Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
	}
	}

	if (!Legalized) {
	Legalized = LegalizeSetCCCondCode(
	getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC, NeedInvert,
	dl);

	assert(Legalized && "Can't legalize SELECT_CC with legal condition!");

	// If we expanded the SETCC by inverting the condition code, then swap
	// the True/False operands to match.
	if (NeedInvert)
	std::swap(Tmp3, Tmp4);

	// If we expanded the SETCC by swapping LHS and RHS, or by inverting the
	// condition code, create a new SELECT_CC node.
	if (CC.getNode()) {
	Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
	Tmp1, Tmp2, Tmp3, Tmp4, CC);
	} else {
	Tmp2 = DAG.getConstant(0, dl, Tmp1.getValueType());
	CC = DAG.getCondCode(ISD::SETNE);
	Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1,
	Tmp2, Tmp3, Tmp4, CC);
	}
	}
	Results.push_back(Tmp1);
	break;
	}
	case ISD::BR_CC: {
	Tmp1 = Node->getOperand(0); // Chain
	Tmp2 = Node->getOperand(2); // LHS
	Tmp3 = Node->getOperand(3); // RHS
	Tmp4 = Node->getOperand(1); // CC

	bool Legalized = LegalizeSetCCCondCode(getSetCCResultType(
	Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, NeedInvert, dl);
	(void)Legalized;
	assert(Legalized && "Can't legalize BR_CC with legal condition!");

	// If we expanded the SETCC by inverting the condition code, then wrap
	// the existing SETCC in a NOT to restore the intended condition.
	if (NeedInvert)
	Tmp4 = DAG.getNOT(dl, Tmp4, Tmp4->getValueType(0));

	// If we expanded the SETCC by swapping LHS and RHS, create a new BR_CC
	// node.
	if (Tmp4.getNode()) {
	Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1,
	Tmp4, Tmp2, Tmp3, Node->getOperand(4));
	} else {
	Tmp3 = DAG.getConstant(0, dl, Tmp2.getValueType());
	Tmp4 = DAG.getCondCode(ISD::SETNE);
	Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4,
	Tmp2, Tmp3, Node->getOperand(4));
	}
	Results.push_back(Tmp1);
	break;
	}
	case ISD::BUILD_VECTOR:
	Results.push_back(ExpandBUILD_VECTOR(Node));
	break;
	case ISD::SRA:
	case ISD::SRL:
	case ISD::SHL: {
	// Scalarize vector SRA/SRL/SHL.
	EVT VT = Node->getValueType(0);
	assert(VT.isVector() && "Unable to legalize non-vector shift");
	assert(TLI.isTypeLegal(VT.getScalarType())&& "Element type must be legal");
	unsigned NumElem = VT.getVectorNumElements();

	SmallVector<SDValue, 8> Scalars;
	for (unsigned Idx = 0; Idx < NumElem; Idx++) {
	SDValue Ex = DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(0),
	DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
	SDValue Sh = DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(1),
	DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
	Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
	VT.getScalarType(), Ex, Sh));
	}

	SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars);
	ReplaceNode(SDValue(Node, 0), Result);
	break;
	}
	case ISD::GLOBAL_OFFSET_TABLE:
	case ISD::GlobalAddress:
	case ISD::GlobalTLSAddress:
	case ISD::ExternalSymbol:
	case ISD::ConstantPool:
	case ISD::JumpTable:
	case ISD::INTRINSIC_W_CHAIN:
	case ISD::INTRINSIC_WO_CHAIN:
	case ISD::INTRINSIC_VOID:
	// FIXME: Custom lowering for these operations shouldn't return null!
	break;
	}

	// Replace the original node with the legalized result.
	if (Results.empty()) {
	DEBUG(dbgs() << "Cannot expand node\n");
	return false;
	}

	DEBUG(dbgs() << "Succesfully expanded node\n");
	ReplaceNode(Node, Results.data());
	return true;
	}

	/// Try to replace \p Node with a call to a runtime library routine.
	///
	/// The switch maps every handled opcode to the libcall (or family of
	/// libcalls) used to implement it and collects the replacement values. When
	/// at least one value was produced the node is replaced via ReplaceNode;
	/// otherwise the node is left alone and only a debug message is printed.
	void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
	  DEBUG(dbgs() << "Trying to convert node to libcall\n");
	  SmallVector<SDValue, 8> NewValues;
	  SDLoc Loc(Node);
	  const unsigned Opcode = Node->getOpcode();

	  // Emit a call to the argument-less, void-returning C function named
	  // Symbol, chained after operand 0 of the node. The second element of the
	  // pair returned by LowerCallTo becomes the only replacement value
	  // (presumably the output chain, since the callee returns void).
	  auto EmitVoidCall = [&](const char *Symbol) {
	    TargetLowering::ArgListTy NoArgs;
	    TargetLowering::CallLoweringInfo CLI(DAG);
	    SDValue Callee =
	        DAG.getExternalSymbol(Symbol, TLI.getPointerTy(DAG.getDataLayout()));
	    CLI.setDebugLoc(Loc)
	        .setChain(Node->getOperand(0))
	        .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
	                      Callee, std::move(NoArgs));
	    NewValues.push_back(TLI.LowerCallTo(CLI).second);
	  };

	  // Record the value produced by ExpandFPLibCall, which is handed one
	  // candidate libcall per floating-point flavour (f32, f64, f80, f128 and
	  // ppcf128, going by the enumerator names).
	  auto AddFPCall = [&](RTLIB::Libcall F32, RTLIB::Libcall F64,
	                       RTLIB::Libcall F80, RTLIB::Libcall F128,
	                       RTLIB::Libcall PPCF128) {
	    NewValues.push_back(ExpandFPLibCall(Node, F32, F64, F80, F128, PPCF128));
	  };

	  // Record the value produced by ExpandIntLibCall, which is handed the
	  // signedness flag plus one candidate libcall per integer width (i8 through
	  // i128, going by the enumerator names).
	  auto AddIntCall = [&](bool IsSigned, RTLIB::Libcall I8, RTLIB::Libcall I16,
	                        RTLIB::Libcall I32, RTLIB::Libcall I64,
	                        RTLIB::Libcall I128) {
	    NewValues.push_back(
	        ExpandIntLibCall(Node, IsSigned, I8, I16, I32, I64, I128));
	  };

	  switch (Opcode) {
	  //===--- Fences, atomics and traps -------------------------------------===//
	  case ISD::ATOMIC_FENCE:
	    // The target did not lower the fence itself, so fall back to a call to
	    // '__sync_synchronize()'.
	    // FIXME: handle "fence singlethread" more efficiently.
	    EmitVoidCall("__sync_synchronize");
	    break;
	  // Atomic intrinsics are Legal by default. A target without direct support
	  // may mark them Expand to request libcalls, which is how they end up here.
	  case ISD::ATOMIC_SWAP:
	  case ISD::ATOMIC_LOAD_ADD:
	  case ISD::ATOMIC_LOAD_SUB:
	  case ISD::ATOMIC_LOAD_AND:
	  case ISD::ATOMIC_LOAD_OR:
	  case ISD::ATOMIC_LOAD_XOR:
	  case ISD::ATOMIC_LOAD_NAND:
	  case ISD::ATOMIC_LOAD_MIN:
	  case ISD::ATOMIC_LOAD_MAX:
	  case ISD::ATOMIC_LOAD_UMIN:
	  case ISD::ATOMIC_LOAD_UMAX:
	  case ISD::ATOMIC_CMP_SWAP: {
	    // The libcall is picked from the opcode and the node's memory type.
	    MVT MemVT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
	    RTLIB::Libcall SyncCall = RTLIB::getSYNC(Opcode, MemVT);
	    assert(SyncCall != RTLIB::UNKNOWN_LIBCALL &&
	           "Unexpected atomic op or value type!");

	    // Both halves of the returned pair replace results of the node.
	    std::pair<SDValue, SDValue> Expanded =
	        ExpandChainLibCall(SyncCall, Node, false);
	    NewValues.push_back(Expanded.first);
	    NewValues.push_back(Expanded.second);
	    break;
	  }
	  case ISD::TRAP:
	    // An unsupported trap becomes a call to 'abort()'.
	    EmitVoidCall("abort");
	    break;

	  //===--- Floating-point arithmetic -------------------------------------===//
	  case ISD::FADD:
	    AddFPCall(RTLIB::ADD_F32, RTLIB::ADD_F64, RTLIB::ADD_F80, RTLIB::ADD_F128,
	              RTLIB::ADD_PPCF128);
	    break;
	  case ISD::FSUB:
	    AddFPCall(RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128,
	              RTLIB::SUB_PPCF128);
	    break;
	  case ISD::FMUL:
	    AddFPCall(RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80, RTLIB::MUL_F128,
	              RTLIB::MUL_PPCF128);
	    break;
	  case ISD::FDIV:
	    AddFPCall(RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128,
	              RTLIB::DIV_PPCF128);
	    break;
	  case ISD::FREM:
	    AddFPCall(RTLIB::REM_F32, RTLIB::REM_F64, RTLIB::REM_F80, RTLIB::REM_F128,
	              RTLIB::REM_PPCF128);
	    break;
	  case ISD::FMA:
	  case ISD::STRICT_FMA:
	    AddFPCall(RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80, RTLIB::FMA_F128,
	              RTLIB::FMA_PPCF128);
	    break;
	  case ISD::FMINNUM:
	    AddFPCall(RTLIB::FMIN_F32, RTLIB::FMIN_F64, RTLIB::FMIN_F80,
	              RTLIB::FMIN_F128, RTLIB::FMIN_PPCF128);
	    break;
	  case ISD::FMAXNUM:
	    AddFPCall(RTLIB::FMAX_F32, RTLIB::FMAX_F64, RTLIB::FMAX_F80,
	              RTLIB::FMAX_F128, RTLIB::FMAX_PPCF128);
	    break;

	  //===--- Floating-point math functions ---------------------------------===//
	  case ISD::FSQRT:
	  case ISD::STRICT_FSQRT:
	    AddFPCall(RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80,
	              RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128);
	    break;
	  case ISD::FSIN:
	  case ISD::STRICT_FSIN:
	    AddFPCall(RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80, RTLIB::SIN_F128,
	              RTLIB::SIN_PPCF128);
	    break;
	  case ISD::FCOS:
	  case ISD::STRICT_FCOS:
	    AddFPCall(RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80, RTLIB::COS_F128,
	              RTLIB::COS_PPCF128);
	    break;
	  case ISD::FSINCOS:
	    // The sincos helper appends its replacement values itself.
	    ExpandSinCosLibCall(Node, NewValues);
	    break;
	  case ISD::FLOG:
	  case ISD::STRICT_FLOG:
	    AddFPCall(RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128,
	              RTLIB::LOG_PPCF128);
	    break;
	  case ISD::FLOG2:
	  case ISD::STRICT_FLOG2:
	    AddFPCall(RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80,
	              RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128);
	    break;
	  case ISD::FLOG10:
	  case ISD::STRICT_FLOG10:
	    AddFPCall(RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80,
	              RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128);
	    break;
	  case ISD::FEXP:
	  case ISD::STRICT_FEXP:
	    AddFPCall(RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128,
	              RTLIB::EXP_PPCF128);
	    break;
	  case ISD::FEXP2:
	  case ISD::STRICT_FEXP2:
	    AddFPCall(RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80,
	              RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128);
	    break;
	  case ISD::FPOWI:
	  case ISD::STRICT_FPOWI:
	    AddFPCall(RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80,
	              RTLIB::POWI_F128, RTLIB::POWI_PPCF128);
	    break;
	  case ISD::FPOW:
	  case ISD::STRICT_FPOW:
	    AddFPCall(RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128,
	              RTLIB::POW_PPCF128);
	    break;

	  //===--- Floating-point rounding ---------------------------------------===//
	  case ISD::FTRUNC:
	    AddFPCall(RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, RTLIB::TRUNC_F80,
	              RTLIB::TRUNC_F128, RTLIB::TRUNC_PPCF128);
	    break;
	  case ISD::FFLOOR:
	    AddFPCall(RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, RTLIB::FLOOR_F80,
	              RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128);
	    break;
	  case ISD::FCEIL:
	    AddFPCall(RTLIB::CEIL_F32, RTLIB::CEIL_F64, RTLIB::CEIL_F80,
	              RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128);
	    break;
	  case ISD::FRINT:
	  case ISD::STRICT_FRINT:
	    AddFPCall(RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80,
	              RTLIB::RINT_F128, RTLIB::RINT_PPCF128);
	    break;
	  case ISD::FNEARBYINT:
	  case ISD::STRICT_FNEARBYINT:
	    AddFPCall(RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64,
	              RTLIB::NEARBYINT_F80, RTLIB::NEARBYINT_F128,
	              RTLIB::NEARBYINT_PPCF128);
	    break;
	  case ISD::FROUND:
	    AddFPCall(RTLIB::ROUND_F32, RTLIB::ROUND_F64, RTLIB::ROUND_F80,
	              RTLIB::ROUND_F128, RTLIB::ROUND_PPCF128);
	    break;

	  //===--- Half-precision conversions ------------------------------------===//
	  case ISD::FP16_TO_FP:
	    // Only a result of type f32 is turned into a libcall here; any other
	    // result type produces no replacement.
	    if (Node->getValueType(0) == MVT::f32)
	      NewValues.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
	    break;
	  case ISD::FP_TO_FP16: {
	    RTLIB::Libcall RoundCall =
	        RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
	    assert(RoundCall != RTLIB::UNKNOWN_LIBCALL &&
	           "Unable to expand fp_to_fp16");
	    NewValues.push_back(ExpandLibCall(RoundCall, Node, false));
	    break;
	  }

	  //===--- Integer arithmetic --------------------------------------------===//
	  case ISD::SREM:
	    AddIntCall(true, RTLIB::SREM_I8, RTLIB::SREM_I16, RTLIB::SREM_I32,
	               RTLIB::SREM_I64, RTLIB::SREM_I128);
	    break;
	  case ISD::UREM:
	    AddIntCall(false, RTLIB::UREM_I8, RTLIB::UREM_I16, RTLIB::UREM_I32,
	               RTLIB::UREM_I64, RTLIB::UREM_I128);
	    break;
	  case ISD::SDIV:
	    AddIntCall(true, RTLIB::SDIV_I8, RTLIB::SDIV_I16, RTLIB::SDIV_I32,
	               RTLIB::SDIV_I64, RTLIB::SDIV_I128);
	    break;
	  case ISD::UDIV:
	    AddIntCall(false, RTLIB::UDIV_I8, RTLIB::UDIV_I16, RTLIB::UDIV_I32,
	               RTLIB::UDIV_I64, RTLIB::UDIV_I128);
	    break;
	  case ISD::SDIVREM:
	  case ISD::UDIVREM:
	    // The divrem helper appends its replacement values itself.
	    ExpandDivRemLibCall(Node, NewValues);
	    break;
	  case ISD::MUL:
	    AddIntCall(false, RTLIB::MUL_I8, RTLIB::MUL_I16, RTLIB::MUL_I32,
	               RTLIB::MUL_I64, RTLIB::MUL_I128);
	    break;
	  }

	  // Nothing was produced: leave the node as it is.
	  if (NewValues.empty()) {
	    DEBUG(dbgs() << "Could not convert node to libcall\n");
	    return;
	  }

	  // Swap the original node for the values computed above.
	  DEBUG(dbgs() << "Successfully converted node to libcall\n");
	  ReplaceNode(Node, NewValues.data());
	}

	// Compute the vector type that stands in for a single original element of
	// type EltVT when a vector is promoted to an equally sized vector whose
	// elements have type NewEltVT. The result has NewEltVT elements, and its
	// element count is the ratio of the two element sizes. TLI is only consulted
	// by the assertion that the resulting type is legal.
	static MVT getPromotedVectorElementType(const TargetLowering &TLI,
	                                        MVT EltVT, MVT NewEltVT) {
	  // Number of new elements needed to cover one old element.
	  const unsigned NewEltsPerOldElt =
	      EltVT.getSizeInBits() / NewEltVT.getSizeInBits();

	  MVT PromotedEltVT = MVT::getVectorVT(NewEltVT, NewEltsPerOldElt);
	  assert(TLI.isTypeLegal(PromotedEltVT) && "unexpected");
	  return PromotedEltVT;
	}

	void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
	DEBUG(dbgs() << "Trying to promote node\n");
	SmallVector<SDValue, 8> Results;
	MVT OVT = Node->getSimpleValueType(0);
	if (Node->getOpcode() == ISD::UINT_TO_FP \|\|
	Node->getOpcode() == ISD::SINT_TO_FP \|\|
	Node->getOpcode() == ISD::SETCC \|\|
	Node->getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	Node->getOpcode() == ISD::INSERT_VECTOR_ELT) {
	OVT = Node->getOperand(0).getSimpleValueType();
	}
	if (Node->getOpcode() == ISD::BR_CC)
	OVT = Node->getOperand(2).getSimpleValueType();
	MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
	SDLoc dl(Node);
	SDValue Tmp1, Tmp2, Tmp3;
	switch (Node->getOpcode()) {
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF:
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF:
	case ISD::CTPOP:
	// Zero extend the argument.
	Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
	if (Node->getOpcode() == ISD::CTTZ) {
	// The count is the same in the promoted type except if the original
	// value was zero. This can be handled by setting the bit just off
	// the top of the original type.
	auto TopBit = APInt::getOneBitSet(NVT.getSizeInBits(),
	OVT.getSizeInBits());
	Tmp1 = DAG.getNode(ISD::OR, dl, NVT, Tmp1,
	DAG.getConstant(TopBit, dl, NVT));
	}
	// Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
	// already the correct result.
	Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
	if (Node->getOpcode() == ISD::CTLZ \|\|
	Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
	// Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
	Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
	DAG.getConstant(NVT.getSizeInBits() -
	OVT.getSizeInBits(), dl, NVT));
	}
	Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
	break;
	case ISD::BITREVERSE:
	case ISD::BSWAP: {
	unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
	Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
	Tmp1 = DAG.getNode(
	ISD::SRL, dl, NVT, Tmp1,
	DAG.getConstant(DiffBits, dl,
	TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
	Results.push_back(Tmp1);
	break;
	}
	case ISD::FP_TO_UINT:
	case ISD::FP_TO_SINT:
	Tmp1 = PromoteLegalFP_TO_INT(Node->getOperand(0), Node->getValueType(0),
	Node->getOpcode() == ISD::FP_TO_SINT, dl);
	Results.push_back(Tmp1);
	break;
	case ISD::UINT_TO_FP:
	case ISD::SINT_TO_FP:
	Tmp1 = PromoteLegalINT_TO_FP(Node->getOperand(0), Node->getValueType(0),
	Node->getOpcode() == ISD::SINT_TO_FP, dl);
	Results.push_back(Tmp1);
	break;
	case ISD::VAARG: {
	SDValue Chain = Node->getOperand(0); // Get the chain.
	SDValue Ptr = Node->getOperand(1); // Get the pointer.

	unsigned TruncOp;
	if (OVT.isVector()) {
	TruncOp = ISD::BITCAST;
	} else {
	assert(OVT.isInteger()
	&& "VAARG promotion is supported only for vectors or integer types");
	TruncOp = ISD::TRUNCATE;
	}

	// Perform the larger operation, then convert back
	Tmp1 = DAG.getVAArg(NVT, dl, Chain, Ptr, Node->getOperand(2),
	Node->getConstantOperandVal(3));
	Chain = Tmp1.getValue(1);

	Tmp2 = DAG.getNode(TruncOp, dl, OVT, Tmp1);

	// Modified the chain result - switch anything that used the old chain to
	// use the new one.
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
	if (UpdatedNodes) {
	UpdatedNodes->insert(Tmp2.getNode());
	UpdatedNodes->insert(Chain.getNode());
	}
	ReplacedNode(Node);
	break;
	}
	case ISD::MUL:
	case ISD::SDIV:
	case ISD::SREM:
	case ISD::UDIV:
	case ISD::UREM:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: {
	unsigned ExtOp, TruncOp;
	if (OVT.isVector()) {
	ExtOp = ISD::BITCAST;
	TruncOp = ISD::BITCAST;
	} else {
	assert(OVT.isInteger() && "Cannot promote logic operation");

	switch (Node->getOpcode()) {
	default:
	ExtOp = ISD::ANY_EXTEND;
	break;
	case ISD::SDIV:
	case ISD::SREM:
	ExtOp = ISD::SIGN_EXTEND;
	break;
	case ISD::UDIV:
	case ISD::UREM:
	ExtOp = ISD::ZERO_EXTEND;
	break;
	}
	TruncOp = ISD::TRUNCATE;
	}
	// Promote each of the values to the new type.
	Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
	// Perform the larger operation, then convert back
	Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
	Results.push_back(DAG.getNode(TruncOp, dl, OVT, Tmp1));
	break;
	}
	case ISD::UMUL_LOHI:
	case ISD::SMUL_LOHI: {
	// Promote to a multiply in a wider integer type.
	unsigned ExtOp = Node->getOpcode() == ISD::UMUL_LOHI ? ISD::ZERO_EXTEND
	: ISD::SIGN_EXTEND;
	Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
	Tmp1 = DAG.getNode(ISD::MUL, dl, NVT, Tmp1, Tmp2);

	auto &DL = DAG.getDataLayout();
	unsigned OriginalSize = OVT.getScalarSizeInBits();
	Tmp2 = DAG.getNode(
	ISD::SRL, dl, NVT, Tmp1,
	DAG.getConstant(OriginalSize, dl, TLI.getScalarShiftAmountTy(DL, NVT)));
	Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
	Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2));
	break;
	}
	case ISD::SELECT: {
	unsigned ExtOp, TruncOp;
	if (Node->getValueType(0).isVector() \|\|
	Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) {
	ExtOp = ISD::BITCAST;
	TruncOp = ISD::BITCAST;
	} else if (Node->getValueType(0).isInteger()) {
	ExtOp = ISD::ANY_EXTEND;
	TruncOp = ISD::TRUNCATE;
	} else {
	ExtOp = ISD::FP_EXTEND;
	TruncOp = ISD::FP_ROUND;
	}
	Tmp1 = Node->getOperand(0);
	// Promote each of the values to the new type.
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
	Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
	// Perform the larger operation, then round down.
	Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3);
	if (TruncOp != ISD::FP_ROUND)
	Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
	else
	Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Tmp1);
	break;
	}
	case ISD::VECTOR_SHUFFLE: {
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();

	// Cast the two input vectors.
	Tmp1 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(1));

	// Convert the shuffle mask to the right # elements.
	Tmp1 = ShuffleWithNarrowerEltType(NVT, OVT, dl, Tmp1, Tmp2, Mask);
	Tmp1 = DAG.getNode(ISD::BITCAST, dl, OVT, Tmp1);
	Results.push_back(Tmp1);
	break;
	}
	case ISD::SETCC: {
	unsigned ExtOp = ISD::FP_EXTEND;
	if (NVT.isInteger()) {
	ISD::CondCode CCCode =
	cast<CondCodeSDNode>(Node->getOperand(2))->get();
	ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	}
	Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
	Results.push_back(DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
	Tmp1, Tmp2, Node->getOperand(2)));
	break;
	}
	case ISD::BR_CC: {
	unsigned ExtOp = ISD::FP_EXTEND;
	if (NVT.isInteger()) {
	ISD::CondCode CCCode =
	cast<CondCodeSDNode>(Node->getOperand(1))->get();
	ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	}
	Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
	Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(3));
	Results.push_back(DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0),
	Node->getOperand(0), Node->getOperand(1),
	Tmp1, Tmp2, Node->getOperand(4)));
	break;
	}
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	case ISD::FDIV:
	case ISD::FREM:
	case ISD::FMINNUM:
	case ISD::FMAXNUM:
	case ISD::FPOW:
	Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
	Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2,
	Node->getFlags());
	Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
	Tmp3, DAG.getIntPtrConstant(0, dl)));
	break;
	case ISD::FMA:
	Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
	Tmp3 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(2));
	Results.push_back(
	DAG.getNode(ISD::FP_ROUND, dl, OVT,
	DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3),
	DAG.getIntPtrConstant(0, dl)));
	break;
	case ISD::FCOPYSIGN:
	case ISD::FPOWI: {
	Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp2 = Node->getOperand(1);
	Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);

	// fcopysign doesn't change anything but the sign bit, so
	// (fp_round (fcopysign (fpext a), b))
	// is as precise as
	// (fp_round (fpext a))
	// which is a no-op. Mark it as a TRUNCating FP_ROUND.
	const bool isTrunc = (Node->getOpcode() == ISD::FCOPYSIGN);
	Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
	Tmp3, DAG.getIntPtrConstant(isTrunc, dl)));
	break;
	}
	case ISD::FFLOOR:
	case ISD::FCEIL:
	case ISD::FRINT:
	case ISD::FNEARBYINT:
	case ISD::FROUND:
	case ISD::FTRUNC:
	case ISD::FNEG:
	case ISD::FSQRT:
	case ISD::FSIN:
	case ISD::FCOS:
	case ISD::FLOG:
	case ISD::FLOG2:
	case ISD::FLOG10:
	case ISD::FABS:
	case ISD::FEXP:
	case ISD::FEXP2:
	Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
	Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
	Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
	Tmp2, DAG.getIntPtrConstant(0, dl)));
	break;
	case ISD::BUILD_VECTOR: {
	MVT EltVT = OVT.getVectorElementType();
	MVT NewEltVT = NVT.getVectorElementType();

	// Handle bitcasts to a different vector type with the same total bit size
	//
	// e.g. v2i64 = build_vector i64:x, i64:y => v4i32
	// =>
	// v4i32 = concat_vectors (v2i32 (bitcast i64:x)), (v2i32 (bitcast i64:y))

	assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
	"Invalid promote type for build_vector");
	assert(NewEltVT.bitsLT(EltVT) && "not handled");

	MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);

	SmallVector<SDValue, 8> NewOps;
	for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) {
	SDValue Op = Node->getOperand(I);
	NewOps.push_back(DAG.getNode(ISD::BITCAST, SDLoc(Op), MidVT, Op));
	}

	SDLoc SL(Node);
	SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps);
	SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
	Results.push_back(CvtVec);
	break;
	}
	case ISD::EXTRACT_VECTOR_ELT: {
	MVT EltVT = OVT.getVectorElementType();
	MVT NewEltVT = NVT.getVectorElementType();

	// Handle bitcasts to a different vector type with the same total bit size.
	//
	// e.g. v2i64 = extract_vector_elt x:v2i64, y:i32
	// =>
	// v4i32:castx = bitcast x:v2i64
	//
	// i64 = bitcast
	// (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
	// (i32 (extract_vector_elt castx, (2 * y + 1)))
	//

	assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
	"Invalid promote type for extract_vector_elt");
	assert(NewEltVT.bitsLT(EltVT) && "not handled");

	MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
	unsigned NewEltsPerOldElt = MidVT.getVectorNumElements();

	SDValue Idx = Node->getOperand(1);
	EVT IdxVT = Idx.getValueType();
	SDLoc SL(Node);
	SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SL, IdxVT);
	SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor);

	SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0));

	SmallVector<SDValue, 8> NewOps;
	for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
	SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT);
	SDValue TmpIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset);

	SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT,
	CastVec, TmpIdx);
	NewOps.push_back(Elt);
	}

	SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps);
	Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec));
	break;
	}
	case ISD::INSERT_VECTOR_ELT: {
	MVT EltVT = OVT.getVectorElementType();
	MVT NewEltVT = NVT.getVectorElementType();

	// Handle bitcasts to a different vector type with the same total bit size
	//
	// e.g. v2i64 = insert_vector_elt x:v2i64, y:i64, z:i32
	// =>
	// v4i32:castx = bitcast x:v2i64
	// v2i32:casty = bitcast y:i64
	//
	// v2i64 = bitcast
	// (v4i32 insert_vector_elt
	// (v4i32 insert_vector_elt v4i32:castx,
	// (extract_vector_elt casty, 0), 2 * z),
	// (extract_vector_elt casty, 1), (2 * z + 1))

	assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
	"Invalid promote type for insert_vector_elt");
	assert(NewEltVT.bitsLT(EltVT) && "not handled");

	MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
	unsigned NewEltsPerOldElt = MidVT.getVectorNumElements();

	SDValue Val = Node->getOperand(1);
	SDValue Idx = Node->getOperand(2);
	EVT IdxVT = Idx.getValueType();
	SDLoc SL(Node);

	SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SDLoc(), IdxVT);
	SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor);

	SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0));
	SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val);

	SDValue NewVec = CastVec;
	for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
	SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT);
	SDValue InEltIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset);

	SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT,
	CastVal, IdxOffset);

	NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NVT,
	NewVec, Elt, InEltIdx);
	}

	Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewVec));
	break;
	}
	case ISD::SCALAR_TO_VECTOR: {
	MVT EltVT = OVT.getVectorElementType();
	MVT NewEltVT = NVT.getVectorElementType();

	// Handle bitcasts to different vector type with the same total bit size.
	//
	// e.g. v2i64 = scalar_to_vector x:i64
	// =>
	// concat_vectors (v2i32 bitcast x:i64), (v2i32 undef)
	//

	MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
	SDValue Val = Node->getOperand(0);
	SDLoc SL(Node);

	SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val);
	SDValue Undef = DAG.getUNDEF(MidVT);

	SmallVector<SDValue, 8> NewElts;
	NewElts.push_back(CastVal);
	for (unsigned I = 1, NElts = OVT.getVectorNumElements(); I != NElts; ++I)
	NewElts.push_back(Undef);

	SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewElts);
	SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
	Results.push_back(CvtVec);
	break;
	}
	}

	// Replace the original node with the legalized result.
	if (!Results.empty()) {
	DEBUG(dbgs() << "Successfully promoted node\n");
	ReplaceNode(Node, Results.data());
	} else
	DEBUG(dbgs() << "Could not promote node\n");
	}

	/// This is the entry point for the file: legalize every node in the DAG.
	///
	/// The node list is swept from its end towards its beginning, and the sweep
	/// is repeated until a complete pass legalizes no new node. Nodes without
	/// uses (other than the root) are deleted as soon as they are encountered.
	void SelectionDAG::Legalize() {
	  AssignTopologicalOrder();

	  SmallPtrSet<SDNode *, 16> LegalizedNodes;
	  // Use a delete listener to remove nodes which were deleted during
	  // legalization from LegalizedNodes. This is needed to handle the situation
	  // where a new node is allocated by the object pool to the same address of a
	  // previously deleted node.
	  DAGNodeDeletedListener DeleteListener(
	      *this,
	      [&LegalizedNodes](SDNode *N, SDNode *E) { LegalizedNodes.erase(N); });

	  SelectionDAGLegalize Legalizer(*this, LegalizedNodes);

	  // Visit all the nodes. We start in topological order, so that we see
	  // nodes with their original operands intact. Legalization can produce
	  // new nodes which may themselves need to be legalized. Iterate until all
	  // nodes have been legalized.
	  while (true) {
	    bool AnyLegalized = false;
	    for (auto NI = allnodes_end(); NI != allnodes_begin();) {
	      --NI;

	      SDNode *N = &*NI;
	      if (N->use_empty() && N != getRoot().getNode()) {
	        // Step the iterator off N before deleting it.
	        ++NI;
	        DeleteNode(N);
	        continue;
	      }

	      // insert().second is true only the first time N is seen.
	      if (LegalizedNodes.insert(N).second) {
	        AnyLegalized = true;
	        Legalizer.LegalizeOp(N);

	        // Legalization may have left N without uses; delete it right away.
	        if (N->use_empty() && N != getRoot().getNode()) {
	          ++NI;
	          DeleteNode(N);
	        }
	      }
	    }
	    if (!AnyLegalized)
	      break;
	  }

	  // Remove dead nodes now.
	  RemoveDeadNodes();
	}

	/// Legalize the single node \p N using a private, throw-away set of
	/// legalized nodes. Nodes touched along the way are reported through
	/// \p UpdatedNodes, which is handed to the legalizer.
	///
	/// \returns true if \p N is still a member of the legalized set once the
	/// legalizer has finished with it.
	bool SelectionDAG::LegalizeOp(SDNode *N,
	                              SmallSetVector<SDNode *, 16> &UpdatedNodes) {
	  SmallPtrSet<SDNode *, 16> Visited;
	  SelectionDAGLegalize Legalizer(*this, Visited, &UpdatedNodes);

	  // Seed the set with N itself, then legalize it; the legalizer recurses
	  // through operands as needed.
	  Visited.insert(N);
	  Legalizer.LegalizeOp(N);

	  const bool StillPresent = Visited.count(N) != 0;
	  return StillPresent;
	}
	Index: head/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp (revision 328816)
	+++ head/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp (revision 328817)
	@@ -1,3119 +1,3131 @@
	//===- ScheduleDAGRRList.cpp - Reg pressure reduction list scheduler ------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements bottom-up and top-down register pressure reduction list
	// schedulers, using standard algorithms. The basic approach uses a priority
	// queue of available nodes to schedule. One at a time, nodes are taken from
	// the priority queue (thus in priority order), checked for legality to
	// schedule, and emitted if legal.
	//
	//===----------------------------------------------------------------------===//

	#include "ScheduleDAGSDNodes.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/ScheduleDAG.h"
	#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
	#include "llvm/CodeGen/SchedulerRegistry.h"
	#include "llvm/CodeGen/SelectionDAGISel.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetOpcodes.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstdlib>
	#include <iterator>
	#include <limits>
	#include <memory>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "pre-RA-sched"

	// Counters declared through the STATISTIC macro for this scheduler.
	STATISTIC(NumBacktracks, "Number of times scheduler backtracked");
	STATISTIC(NumUnfolds, "Number of nodes unfolded");
	STATISTIC(NumDups, "Number of duplicated nodes");
	STATISTIC(NumPRCopies, "Number of physical register copies");

	// Scheduler registrations. Each entry pairs a scheduler name and a
	// description with the factory function that creates that scheduler.
	static RegisterScheduler
	burrListDAGScheduler("list-burr",
	"Bottom-up register reduction list scheduling",
	createBURRListDAGScheduler);

	static RegisterScheduler
	sourceListDAGScheduler("source",
	"Similar to list-burr but schedules in source "
	"order when possible",
	createSourceListDAGScheduler);

	static RegisterScheduler
	hybridListDAGScheduler("list-hybrid",
	"Bottom-up register pressure aware list scheduling "
	"which tries to balance latency and register pressure",
	createHybridListDAGScheduler);

	static RegisterScheduler
	ILPListDAGScheduler("list-ilp",
	"Bottom-up register pressure aware list scheduling "
	"which tries to balance ILP and register pressure",
	createILPListDAGScheduler);

	// Hidden flag, off by default. When set, the scheduler uses the default
	// hazard recognizer and skips cycle-level bookkeeping.
	static cl::opt<bool> DisableSchedCycles(
	"disable-sched-cycles", cl::Hidden, cl::init(false),
	cl::desc("Disable cycle-level precision during preRA scheduling"));

	// Temporary sched=list-ilp flags until the heuristics are robust.
	// Some options are also available under sched=list-hybrid.
	// Note the defaults: live-use priority, no-stall priority and the
	// two-address hack start out disabled (cl::init(true)); the rest start
	// out enabled (cl::init(false)).
	static cl::opt<bool> DisableSchedRegPressure(
	"disable-sched-reg-pressure", cl::Hidden, cl::init(false),
	cl::desc("Disable regpressure priority in sched=list-ilp"));
	static cl::opt<bool> DisableSchedLiveUses(
	"disable-sched-live-uses", cl::Hidden, cl::init(true),
	cl::desc("Disable live use priority in sched=list-ilp"));
	static cl::opt<bool> DisableSchedVRegCycle(
	"disable-sched-vrcycle", cl::Hidden, cl::init(false),
	cl::desc("Disable virtual register cycle interference checks"));
	static cl::opt<bool> DisableSchedPhysRegJoin(
	"disable-sched-physreg-join", cl::Hidden, cl::init(false),
	cl::desc("Disable physreg def-use affinity"));
	static cl::opt<bool> DisableSchedStalls(
	"disable-sched-stalls", cl::Hidden, cl::init(true),
	cl::desc("Disable no-stall priority in sched=list-ilp"));
	static cl::opt<bool> DisableSchedCriticalPath(
	"disable-sched-critical-path", cl::Hidden, cl::init(false),
	cl::desc("Disable critical path priority in sched=list-ilp"));
	static cl::opt<bool> DisableSchedHeight(
	"disable-sched-height", cl::Hidden, cl::init(false),
	cl::desc("Disable scheduled-height priority in sched=list-ilp"));
	static cl::opt<bool> Disable2AddrHack(
	"disable-2addr-hack", cl::Hidden, cl::init(true),
	cl::desc("Disable scheduler's two-address hack"));

	// Hidden integer option, default 6.
	static cl::opt<int> MaxReorderWindow(
	"max-sched-reorder", cl::Hidden, cl::init(6),
	cl::desc("Number of instructions to allow ahead of the critical path "
	"in sched=list-ilp"));

	// Hidden unsigned option, default 1.
	static cl::opt<unsigned> AvgIPC(
	    "sched-avg-ipc", cl::Hidden, cl::init(1),
	    cl::desc("Average inst/cycle when no target itinerary exists."));

	namespace {

	//===----------------------------------------------------------------------===//
	/// ScheduleDAGRRList - The actual register reduction list scheduler
	/// implementation. This supports both top-down and bottom-up scheduling.
	///
	class ScheduleDAGRRList : public ScheduleDAGSDNodes {
	private:
	  /// NeedLatency - True if the scheduler will make use of latency information.
	  bool NeedLatency;

	  /// AvailableQueue - The priority queue to use for the available SUnits.
	  /// Deleted by this object's destructor.
	  SchedulingPriorityQueue *AvailableQueue;

	  /// PendingQueue - This contains all of the instructions whose operands have
	  /// been issued, but their results are not ready yet (due to the latency of
	  /// the operation). Once the operands become available, the instruction is
	  /// added to the AvailableQueue.
	  std::vector<SUnit *> PendingQueue;

	  /// HazardRec - The hazard recognizer to use. Created in the constructor
	  /// and deleted in the destructor.
	  ScheduleHazardRecognizer *HazardRec;

	  /// CurCycle - The current scheduler state corresponds to this cycle.
	  unsigned CurCycle = 0;

	  /// MinAvailableCycle - Cycle of the soonest available instruction.
	  unsigned MinAvailableCycle;

	  /// IssueCount - Count instructions issued in this cycle
	  /// Currently valid only for bottom-up scheduling.
	  unsigned IssueCount;

	  /// LiveRegDefs - A set of physical registers and their definition
	  /// that are "live". These nodes must be scheduled before any other nodes that
	  /// modify the registers can be scheduled.
	  unsigned NumLiveRegs;
	  std::unique_ptr<SUnit*[]> LiveRegDefs;
	  std::unique_ptr<SUnit*[]> LiveRegGens;

	  // Collect interferences between physical register use/defs.
	  // Each interference is an SUnit and set of physical registers.
	  SmallVector<SUnit*, 4> Interferences;

	  using LRegsMapT = DenseMap<SUnit *, SmallVector<unsigned, 4>>;

	  LRegsMapT LRegsMap;

	  /// Topo - A topological ordering for SUnits which permits fast IsReachable
	  /// and similar queries.
	  ScheduleDAGTopologicalSort Topo;

	  // Hack to keep track of the inverse of FindCallSeqStart without more crazy
	  // DAG crawling. Keyed by the call-sequence-start SUnit; the value is the
	  // SUnit of the matching call-sequence end.
	  DenseMap<SUnit*, SUnit*> CallSeqEndForStart;

	public:
	  ScheduleDAGRRList(MachineFunction &mf, bool needlatency,
	                    SchedulingPriorityQueue *availqueue,
	                    CodeGenOpt::Level OptLevel)
	      : ScheduleDAGSDNodes(mf),
	        NeedLatency(needlatency), AvailableQueue(availqueue),
	        Topo(SUnits, nullptr) {
	    const TargetSubtargetInfo &STI = mf.getSubtarget();
	    // Without cycle precision or latency information, the default
	    // recognizer is used instead of the target's.
	    if (DisableSchedCycles || !NeedLatency)
	      HazardRec = new ScheduleHazardRecognizer();
	    else
	      HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this);
	  }

	  ~ScheduleDAGRRList() override {
	    delete HazardRec;
	    delete AvailableQueue;
	  }

	  void Schedule() override;

	  ScheduleHazardRecognizer *getHazardRec() { return HazardRec; }

	  /// IsReachable - Checks if SU is reachable from TargetSU.
	  bool IsReachable(const SUnit *SU, const SUnit *TargetSU) {
	    return Topo.IsReachable(SU, TargetSU);
	  }

	  /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
	  /// create a cycle.
	  bool WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
	    return Topo.WillCreateCycle(SU, TargetSU);
	  }

	  /// AddPred - adds a predecessor edge to SUnit SU.
	  /// The topological ordering is updated first, then the edge is added.
	  void AddPred(SUnit *SU, const SDep &D) {
	    Topo.AddPred(SU, D.getSUnit());
	    SU->addPred(D);
	  }

	  /// RemovePred - removes a predecessor edge from SUnit SU.
	  /// The topological ordering is updated first, then the edge is removed.
	  void RemovePred(SUnit *SU, const SDep &D) {
	    Topo.RemovePred(SU, D.getSUnit());
	    SU->removePred(D);
	  }

	private:
	  /// isReady - True when the ready filter does not apply (cycle precision is
	  /// disabled or the queue has no ready filter), or when the queue itself
	  /// reports SU as ready.
	  bool isReady(SUnit *SU) {
	    return DisableSchedCycles || !AvailableQueue->hasReadyFilter() ||
	           AvailableQueue->isReady(SU);
	  }

	  void ReleasePred(SUnit *SU, const SDep *PredEdge);
	  void ReleasePredecessors(SUnit *SU);
	  void ReleasePending();
	  void AdvanceToCycle(unsigned NextCycle);
	  void AdvancePastStalls(SUnit *SU);
	  void EmitNode(SUnit *SU);
	  void ScheduleNodeBottomUp(SUnit*);
	  void CapturePred(SDep *PredEdge);
	  void UnscheduleNodeBottomUp(SUnit*);
	  void RestoreHazardCheckerBottomUp();
	  void BacktrackBottomUp(SUnit*, SUnit*);
	  SUnit *TryUnfoldSU(SUnit *);
	  SUnit *CopyAndMoveSuccessors(SUnit*);
	  void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
	                                const TargetRegisterClass*,
	                                const TargetRegisterClass*,
	                                SmallVectorImpl<SUnit*>&);
	  bool DelayForLiveRegsBottomUp(SUnit*, SmallVectorImpl<unsigned>&);

	  void releaseInterferences(unsigned Reg = 0);

	  SUnit *PickNodeToScheduleBottomUp();
	  void ListScheduleBottomUp();

	  /// CreateNewSUnit - Creates a new SUnit and returns a pointer to it.
	  /// Updates the topological ordering if required.
	  SUnit *CreateNewSUnit(SDNode *N) {
	    unsigned NumSUnits = SUnits.size();
	    SUnit *NewNode = newSUnit(N);
	    // Update the topological ordering.
	    if (NewNode->NodeNum >= NumSUnits)
	      Topo.InitDAGTopologicalSorting();
	    return NewNode;
	  }

	  /// CreateClone - Creates a new SUnit from an existing one.
	  /// Updates the topological ordering if required.
	  SUnit *CreateClone(SUnit *N) {
	    unsigned NumSUnits = SUnits.size();
	    SUnit *NewNode = Clone(N);
	    // Update the topological ordering.
	    if (NewNode->NodeNum >= NumSUnits)
	      Topo.InitDAGTopologicalSorting();
	    return NewNode;
	  }

	  /// forceUnitLatencies - Register-pressure-reducing scheduling doesn't
	  /// need actual latency information but the hybrid scheduler does.
	  bool forceUnitLatencies() const override {
	    return !NeedLatency;
	  }
	};

	} // end anonymous namespace

	/// GetCostForDef - Looks up the register class and cost for a given definition.
	/// Typically this just means looking up the representative register class,
	/// but for untyped values (MVT::Untyped) it means inspecting the node's
	/// opcode to determine what register class is being generated.
	///
	/// \param RegDefPos  Iterator positioned at the definition being queried.
	/// \param RegClass   [out] ID of the register class chosen for the definition.
	/// \param Cost       [out] Cost of the definition; always 1 for untyped
	///                   values, otherwise the representative class cost for
	///                   the value type.
	static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
	                          const TargetLowering *TLI,
	                          const TargetInstrInfo *TII,
	                          const TargetRegisterInfo *TRI,
	                          unsigned &RegClass, unsigned &Cost,
	                          const MachineFunction &MF) {
	  MVT VT = RegDefPos.GetValue();

	  // Special handling for untyped values. These values can only come from
	  // the expansion of custom DAG-to-DAG patterns.
	  if (VT == MVT::Untyped) {
	    const SDNode *Node = RegDefPos.GetNode();

	    // Special handling for CopyFromReg of untyped values: use the class
	    // already assigned to the source register.
	    if (!Node->isMachineOpcode() && Node->getOpcode() == ISD::CopyFromReg) {
	      unsigned Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
	      const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg);
	      RegClass = RC->getID();
	      Cost = 1;
	      return;
	    }

	    unsigned Opcode = Node->getMachineOpcode();
	    // For REG_SEQUENCE, operand 0 is read as a register class index.
	    if (Opcode == TargetOpcode::REG_SEQUENCE) {
	      unsigned DstRCIdx =
	          cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
	      const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
	      RegClass = RC->getID();
	      Cost = 1;
	      return;
	    }

	    unsigned Idx = RegDefPos.GetIdx();
	    // Bind by reference: the descriptor is only read, so copying it is
	    // unnecessary.
	    const MCInstrDesc &Desc = TII->get(Opcode);
	    const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI, MF);
	    RegClass = RC->getID();
	    // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a
	    // better way to determine it.
	    Cost = 1;
	  } else {
	    RegClass = TLI->getRepRegClassFor(VT)->getID();
	    Cost = TLI->getRepRegClassCostFor(VT);
	  }
	}

	/// Schedule - Schedule the DAG using list scheduling.
	///
	/// Resets the per-run scheduler state, builds the scheduling graph and its
	/// topological ordering, then runs the bottom-up list scheduling loop.
	void ScheduleDAGRRList::Schedule() {
	DEBUG(dbgs() << "********** List Scheduling " << printMBBReference(*BB)
	<< " '" << BB->getName() << "' **********\n");

	// Reset the state left over from any previous run.
	CurCycle = 0;
	IssueCount = 0;
	// With cycle precision disabled the minimum stays at 0; otherwise it
	// starts at the largest value so any node height lowers it.
	MinAvailableCycle =
	DisableSchedCycles ? 0 : std::numeric_limits<unsigned>::max();
	NumLiveRegs = 0;
	// Allocate slots for each physical register, plus one for a special register
	// to track the virtual resource of a calling sequence.
	// The trailing () value-initializes the arrays, so every slot starts null.
	LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]());
	LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]());
	CallSeqEndForStart.clear();
	assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences");

	// Build the scheduling graph.
	BuildSchedGraph(nullptr);

	DEBUG(for (SUnit &SU : SUnits)
	SU.dumpAll(this));
	// The topological ordering is computed after the graph has been built.
	Topo.InitDAGTopologicalSorting();

	AvailableQueue->initNodes(SUnits);

	HazardRec->Reset();

	// Execute the actual scheduling loop.
	ListScheduleBottomUp();

	AvailableQueue->releaseState();

	DEBUG({
	dbgs() << "* Final schedule *\n";
	dumpSchedule();
	dbgs() << '\n';
	});
	}

	//===----------------------------------------------------------------------===//
	// Bottom-Up Scheduling
	//===----------------------------------------------------------------------===//

	/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
	/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
	void ScheduleDAGRRList::ReleasePred(SUnit SU, const SDep PredEdge) {
	SUnit *PredSU = PredEdge->getSUnit();

	#ifndef NDEBUG
	if (PredSU->NumSuccsLeft == 0) {
	dbgs() << "* Scheduling failed! *\n";
	PredSU->dump(this);
	dbgs() << " has been released too many times!\n";
	llvm_unreachable(nullptr);
	}
	#endif
	--PredSU->NumSuccsLeft;

	if (!forceUnitLatencies()) {
	// Updating predecessor's height. This is now the cycle when the
	// predecessor can be scheduled without causing a pipeline stall.
	PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge->getLatency());
	}

	// If all the node's successors are scheduled, this node is ready
	// to be scheduled. Ignore the special EntrySU node.
	if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
	PredSU->isAvailable = true;

	unsigned Height = PredSU->getHeight();
	if (Height < MinAvailableCycle)
	MinAvailableCycle = Height;

	if (isReady(PredSU)) {
	AvailableQueue->push(PredSU);
	}
	// CapturePred and others may have left the node in the pending queue, avoid
	// adding it twice.
	else if (!PredSU->isPending) {
	PredSU->isPending = true;
	PendingQueue.push_back(PredSU);
	}
	}
	}

	/// IsChainDependent - Test if Outer is reachable from Inner through
	/// chain dependencies.
	static bool IsChainDependent(SDNode Outer, SDNode Inner,
	unsigned NestLevel,
	const TargetInstrInfo *TII) {
	SDNode *N = Outer;
	while (true) {
	if (N == Inner)
	return true;
	// For a TokenFactor, examine each operand. There may be multiple ways
	// to get to the CALLSEQ_BEGIN, but we need to find the path with the
	// most nesting in order to ensure that we find the corresponding match.
	if (N->getOpcode() == ISD::TokenFactor) {
	for (const SDValue &Op : N->op_values())
	if (IsChainDependent(Op.getNode(), Inner, NestLevel, TII))
	return true;
	return false;
	}
	// Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
	if (N->isMachineOpcode()) {
	if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
	++NestLevel;
	} else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
	if (NestLevel == 0)
	return false;
	--NestLevel;
	}
	}
	// Otherwise, find the chain and continue climbing.
	for (const SDValue &Op : N->op_values())
	if (Op.getValueType() == MVT::Other) {
	N = Op.getNode();
	goto found_chain_operand;
	}
	return false;
	found_chain_operand:;
	if (N->getOpcode() == ISD::EntryToken)
	return false;
	}
	}

	/// FindCallSeqStart - Starting from the (lowered) CALLSEQ_END node, locate
	/// the corresponding (lowered) CALLSEQ_BEGIN node.
	///
	/// NestLevel and MaxNest are used in recursion to indicate the current level
	/// of nesting of CALLSEQ_BEGIN and CALLSEQ_END pairs, as well as the maximum
	/// level seen so far.
	///
	/// \returns the matching call-frame-setup node, or nullptr if the walk runs
	/// out of chain operands or reaches the EntryToken first.
	///
	/// TODO: It would be better to give CALLSEQ_END an explicit operand to point
	/// to the corresponding CALLSEQ_BEGIN to avoid needing to search for it.
	static SDNode *
	FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
	                 const TargetInstrInfo *TII) {
	  while (true) {
	    // For a TokenFactor, examine each operand. There may be multiple ways
	    // to get to the CALLSEQ_BEGIN, but we need to find the path with the
	    // most nesting in order to ensure that we find the corresponding match.
	    if (N->getOpcode() == ISD::TokenFactor) {
	      SDNode *Best = nullptr;
	      unsigned BestMaxNest = MaxNest;
	      for (const SDValue &Op : N->op_values()) {
	        // Each operand is explored with its own copy of the counters.
	        unsigned MyNestLevel = NestLevel;
	        unsigned MyMaxNest = MaxNest;
	        if (SDNode *New = FindCallSeqStart(Op.getNode(),
	                                           MyNestLevel, MyMaxNest, TII))
	          if (!Best || (MyMaxNest > BestMaxNest)) {
	            Best = New;
	            BestMaxNest = MyMaxNest;
	          }
	      }
	      assert(Best);
	      MaxNest = BestMaxNest;
	      return Best;
	    }
	    // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
	    if (N->isMachineOpcode()) {
	      if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
	        ++NestLevel;
	        MaxNest = std::max(MaxNest, NestLevel);
	      } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
	        assert(NestLevel != 0);
	        --NestLevel;
	        // Back at level zero: this setup node matches the starting node.
	        if (NestLevel == 0)
	          return N;
	      }
	    }
	    // Otherwise, find the chain and continue climbing.
	    for (const SDValue &Op : N->op_values())
	      if (Op.getValueType() == MVT::Other) {
	        N = Op.getNode();
	        goto found_chain_operand;
	      }
	    return nullptr;
	  found_chain_operand:;
	    if (N->getOpcode() == ISD::EntryToken)
	      return nullptr;
	  }
	}

	/// Call ReleasePred for each predecessor, then update register live def/gen.
	/// Always update LiveRegDefs for a register dependence even if the current SU
	/// also defines the register. This effectively creates one large live range
	/// across a sequence of two-address nodes. This is important because the
	/// entire chain must be scheduled together. Example:
	///
	/// flags = (3) add
	/// flags = (2) addc flags
	/// flags = (1) addc flags
	///
	/// results in
	///
	/// LiveRegDefs[flags] = 3
	/// LiveRegGens[flags] = 1
	///
	/// If (2) addc is unscheduled, then (1) addc must also be unscheduled to avoid
	/// interference on flags.
	void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
	  // Bottom up: release predecessors
	  for (SDep &Pred : SU->Preds) {
	    ReleasePred(SU, &Pred);
	    if (Pred.isAssignedRegDep()) {
	      // This is a physical register dependency and it's impossible or
	      // expensive to copy the register. Make sure nothing that can
	      // clobber the register is scheduled between the predecessor and
	      // this node.
	      SUnit *RegDef = LiveRegDefs[Pred.getReg()]; (void)RegDef;
	      assert((!RegDef || RegDef == SU || RegDef == Pred.getSUnit()) &&
	             "interference on register dependence");
	      LiveRegDefs[Pred.getReg()] = Pred.getSUnit();
	      // Only the first node to make the register live is recorded as its
	      // generator.
	      if (!LiveRegGens[Pred.getReg()]) {
	        ++NumLiveRegs;
	        LiveRegGens[Pred.getReg()] = SU;
	      }
	    }
	  }

	  // If we're scheduling a lowered CALLSEQ_END, find the corresponding
	  // CALLSEQ_BEGIN. Inject an artificial physical register dependence between
	  // these nodes, to prevent other calls from being interscheduled with them.
	  // The slot just past the last physical register tracks the call sequence.
	  unsigned CallResource = TRI->getNumRegs();
	  if (!LiveRegDefs[CallResource])
	    for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())
	      if (Node->isMachineOpcode() &&
	          Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
	        unsigned NestLevel = 0;
	        unsigned MaxNest = 0;
	        SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII);
	        assert(N && "Must find call sequence start");

	        SUnit *Def = &SUnits[N->getNodeId()];
	        CallSeqEndForStart[Def] = SU;

	        ++NumLiveRegs;
	        LiveRegDefs[CallResource] = Def;
	        LiveRegGens[CallResource] = SU;
	        break;
	      }
	}

	/// Check to see if any of the pending instructions are ready to issue. If
	/// so, add them to the available queue.
	void ScheduleDAGRRList::ReleasePending() {
	if (DisableSchedCycles) {
	assert(PendingQueue.empty() && "pending instrs not allowed in this mode");
	return;
	}

	// If the available queue is empty, it is safe to reset MinAvailableCycle.
	if (AvailableQueue->empty())
	MinAvailableCycle = std::numeric_limits<unsigned>::max();

	// Check to see if any of the pending instructions are ready to issue. If
	// so, add them to the available queue.
	for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
	unsigned ReadyCycle = PendingQueue[i]->getHeight();
	if (ReadyCycle < MinAvailableCycle)
	MinAvailableCycle = ReadyCycle;

	if (PendingQueue[i]->isAvailable) {
	if (!isReady(PendingQueue[i]))
	continue;
	AvailableQueue->push(PendingQueue[i]);
	}
	PendingQueue[i]->isPending = false;
	PendingQueue[i] = PendingQueue.back();
	PendingQueue.pop_back();
	--i; --e;
	}
	}

	/// Advance the scheduler state to \p NextCycle. Does nothing when
	/// \p NextCycle is not later than the current cycle.
	void ScheduleDAGRRList::AdvanceToCycle(unsigned NextCycle) {
	  if (NextCycle <= CurCycle)
	    return;

	  IssueCount = 0;
	  AvailableQueue->setCurCycle(NextCycle);

	  if (HazardRec->isEnabled()) {
	    // Step the hazard recognizer one cycle at a time.
	    while (CurCycle != NextCycle) {
	      HazardRec->RecedeCycle();
	      ++CurCycle;
	    }
	  } else {
	    // Bypass lots of virtual calls in case of long latency.
	    CurCycle = NextCycle;
	  }

	  // FIXME: Instead of visiting the pending Q each time, set a dirty flag on the
	  // available Q to release pending nodes at least once before popping.
	  ReleasePending();
	}

	/// Advance the scheduler state until \p SU's height has been reached and the
	/// hazard recognizer reports no hazard for it.
	void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) {
	  if (DisableSchedCycles)
	    return;

	  // FIXME: Nodes such as CopyFromReg probably should not advance the current
	  // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node
	  // has predecessors the cycle will be advanced when they are scheduled.
	  // But given the crude nature of modeling latency though such nodes, we
	  // currently need to treat these nodes like real instructions.
	  // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return;

	  // Bump CurCycle to account for latency. We assume the latency of other
	  // available instructions may be hidden by the stall (not a full pipe stall).
	  // This updates the hazard recognizer's cycle before reserving resources for
	  // this instruction.
	  AdvanceToCycle(SU->getHeight());

	  // Calls are scheduled in their preceding cycle, so don't conflict with
	  // hazards from instructions after the call. EmitNode will reset the
	  // scoreboard state before emitting the call.
	  if (SU->isCall)
	    return;

	  // FIXME: For resource conflicts in very long non-pipelined stages, we
	  // should probably skip ahead here to avoid useless scoreboard checks.
	  // Count how many stall cycles pass before the recognizer reports no hazard.
	  int Stalls = 0;
	  while (HazardRec->getHazardType(SU, -Stalls) !=
	         ScheduleHazardRecognizer::NoHazard)
	    ++Stalls;

	  AdvanceToCycle(CurCycle + Stalls);
	}

	/// Report \p SU to the hazard recognizer as an emitted instruction.
	/// CurCycle is left untouched.
	void ScheduleDAGRRList::EmitNode(SUnit *SU) {
	  if (!HazardRec->isEnabled())
	    return;

	  // A null node is a phys reg copy; nothing to record.
	  SDNode *Node = SU->getNode();
	  if (!Node)
	    return;

	  switch (Node->getOpcode()) {
	  case ISD::MERGE_VALUES:
	  case ISD::TokenFactor:
	  case ISD::LIFETIME_START:
	  case ISD::LIFETIME_END:
	  case ISD::CopyToReg:
	  case ISD::CopyFromReg:
	  case ISD::EH_LABEL:
	    // Noops don't affect the scoreboard state. Copies are likely to be
	    // removed.
	    return;
	  case ISD::INLINEASM:
	    // For inline asm, clear the pipeline state.
	    HazardRec->Reset();
	    return;
	  default:
	    assert(Node->isMachineOpcode() &&
	           "This target-independent node should not be scheduled.");
	    break;
	  }

	  // Calls are scheduled with their preceding instructions. For bottom-up
	  // scheduling, clear the pipeline state before emitting.
	  if (SU->isCall)
	    HazardRec->Reset();

	  HazardRec->EmitInstruction(SU);
	}

	// Forward declaration so that ScheduleNodeBottomUp below can call this
	// file-local helper; its definition is not in this part of the file.
	static void resetVRegCycle(SUnit *SU);

	/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
	/// count of its predecessors. If a predecessor pending count is zero, add it to
	/// the Available queue.
	///
	/// Also reserves hazard-recognizer resources for SU, releases the physical
	/// register (and call-resource) live ranges that SU defines, and advances the
	/// cycle when the issue limit for the current cycle has been reached.
	void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
	  DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
	  DEBUG(SU->dump(this));

	#ifndef NDEBUG
	  if (CurCycle < SU->getHeight())
	    DEBUG(dbgs() << " Height [" << SU->getHeight()
	                 << "] pipeline stall!\n");
	#endif

	  // FIXME: Do not modify node height. It may interfere with
	  // backtracking. Instead add a "ready cycle" to SUnit. Before scheduling the
	  // node its ready cycle can aid heuristics, and after scheduling it can
	  // indicate the scheduled cycle.
	  SU->setHeightToAtLeast(CurCycle);

	  // Reserve resources for the scheduled instruction.
	  EmitNode(SU);

	  Sequence.push_back(SU);

	  AvailableQueue->scheduledNode(SU);

	  // If HazardRec is disabled, and each inst counts as one cycle, then
	  // advance CurCycle before ReleasePredecessors to avoid useless pushes to
	  // PendingQueue for schedulers that implement HasReadyFilter.
	  if (!HazardRec->isEnabled() && AvgIPC < 2)
	    AdvanceToCycle(CurCycle + 1);

	  // Update liveness of predecessors before successors to avoid treating a
	  // two-address node as a live range def.
	  ReleasePredecessors(SU);

	  // Release all the implicit physical register defs that are live.
	  for (SDep &Succ : SU->Succs) {
	    // LiveRegDefs[Succ.getReg()] != SU when SU is a two-address node.
	    if (Succ.isAssignedRegDep() && LiveRegDefs[Succ.getReg()] == SU) {
	      assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
	      --NumLiveRegs;
	      LiveRegDefs[Succ.getReg()] = nullptr;
	      LiveRegGens[Succ.getReg()] = nullptr;
	      releaseInterferences(Succ.getReg());
	    }
	  }
	  // Release the special call resource dependence, if this is the beginning
	  // of a call. The call resource lives in the slot just past the last
	  // physical register.
	  unsigned CallResource = TRI->getNumRegs();
	  if (LiveRegDefs[CallResource] == SU)
	    for (const SDNode *SUNode = SU->getNode(); SUNode;
	         SUNode = SUNode->getGluedNode()) {
	      if (SUNode->isMachineOpcode() &&
	          SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
	        assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
	        --NumLiveRegs;
	        LiveRegDefs[CallResource] = nullptr;
	        LiveRegGens[CallResource] = nullptr;
	        releaseInterferences(CallResource);
	      }
	    }

	  resetVRegCycle(SU);

	  SU->isScheduled = true;

	  // Conditions under which the scheduler should eagerly advance the cycle:
	  // (1) No available instructions
	  // (2) All pipelines full, so available instructions must have hazards.
	  //
	  // If HazardRec is disabled, the cycle was pre-advanced before calling
	  // ReleasePredecessors. In that case, IssueCount should remain 0.
	  //
	  // Check AvailableQueue after ReleasePredecessors in case of zero latency.
	  if (HazardRec->isEnabled() || AvgIPC > 1) {
	    if (SU->getNode() && SU->getNode()->isMachineOpcode())
	      ++IssueCount;
	    if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
	        || (!HazardRec->isEnabled() && IssueCount == AvgIPC))
	      AdvanceToCycle(CurCycle + 1);
	  }
	}

	/// CapturePred - Inverse of ReleasePred. The successor on PredEdge is being
	/// unscheduled, so its predecessor gains one outstanding successor and, if it
	/// had been made available, stops being available.
	void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {
	  SUnit *Pred = PredEdge->getSUnit();

	  // Only a node that is available and not pending gets removed from
	  // AvailableQueue; a pending node just loses its available flag.
	  const bool MustDequeue = Pred->isAvailable && !Pred->isPending;
	  Pred->isAvailable = false;
	  if (MustDequeue)
	    AvailableQueue->remove(Pred);

	  assert(Pred->NumSuccsLeft < std::numeric_limits<unsigned>::max() &&
	         "NumSuccsLeft will overflow!");
	  ++Pred->NumSuccsLeft;
	}

	/// UnscheduleNodeBottomUp - Remove the node from the schedule, update its and
	/// its predecessor states to reflect the change.
	///
	/// In addition to recapturing predecessors, this restores the
	/// LiveRegDefs/LiveRegGens bookkeeping (including the special call-resource
	/// slot at index TRI->getNumRegs()), marks SU unscheduled with a dirty height,
	/// and puts it back on either the pending or the available queue.
	void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
	DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: ");
	DEBUG(SU->dump(this));

	// Recapture every predecessor. Where SU is the recorded "gen" of a physical
	// register live range, that live range is dropped again.
	for (SDep &Pred : SU->Preds) {
	CapturePred(&Pred);
	if (Pred.isAssignedRegDep() && SU == LiveRegGens[Pred.getReg()]){
	assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
	assert(LiveRegDefs[Pred.getReg()] == Pred.getSUnit() &&
	"Physical register dependency violated?");
	--NumLiveRegs;
	LiveRegDefs[Pred.getReg()] = nullptr;
	LiveRegGens[Pred.getReg()] = nullptr;
	releaseInterferences(Pred.getReg());
	}
	}

	// Reclaim the special call resource dependence, if this is the beginning
	// of a call.
	unsigned CallResource = TRI->getNumRegs();
	for (const SDNode *SUNode = SU->getNode(); SUNode;
	SUNode = SUNode->getGluedNode()) {
	if (SUNode->isMachineOpcode() &&
	SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
	SUnit *SeqEnd = CallSeqEndForStart[SU];
	assert(SeqEnd && "Call sequence start/end must be known");
	assert(!LiveRegDefs[CallResource]);
	assert(!LiveRegGens[CallResource]);
	++NumLiveRegs;
	LiveRegDefs[CallResource] = SU;
	LiveRegGens[CallResource] = SeqEnd;
	}
	}

	// Release the special call resource dependence, if this is the end
	// of a call.
	if (LiveRegGens[CallResource] == SU)
	for (const SDNode *SUNode = SU->getNode(); SUNode;
	SUNode = SUNode->getGluedNode()) {
	if (SUNode->isMachineOpcode() &&
	SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
	assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
	assert(LiveRegDefs[CallResource]);
	assert(LiveRegGens[CallResource]);
	--NumLiveRegs;
	LiveRegDefs[CallResource] = nullptr;
	LiveRegGens[CallResource] = nullptr;
	releaseInterferences(CallResource);
	}
	}

	// Physical registers that SU defines for its successors become live again.
	for (auto &Succ : SU->Succs) {
	if (Succ.isAssignedRegDep()) {
	auto Reg = Succ.getReg();
	if (!LiveRegDefs[Reg])
	++NumLiveRegs;
	// This becomes the nearest def. Note that an earlier def may still be
	// pending if this is a two-address node.
	LiveRegDefs[Reg] = SU;

	// Update LiveRegGen only if was empty before this unscheduling.
	// This is to avoid incorrect updating LiveRegGen set in previous run.
	if (!LiveRegGens[Reg]) {
	// Find the successor with the lowest height.
	LiveRegGens[Reg] = Succ.getSUnit();
	for (auto &Succ2 : SU->Succs) {
	if (Succ2.isAssignedRegDep() && Succ2.getReg() == Reg &&
	Succ2.getSUnit()->getHeight() < LiveRegGens[Reg]->getHeight())
	LiveRegGens[Reg] = Succ2.getSUnit();
	}
	}
	}
	}
	// Keep MinAvailableCycle at the lowest height seen among requeued nodes.
	if (SU->getHeight() < MinAvailableCycle)
	MinAvailableCycle = SU->getHeight();

	SU->setHeightDirty();
	SU->isScheduled = false;
	SU->isAvailable = true;
	if (!DisableSchedCycles && AvailableQueue->hasReadyFilter()) {
	// Don't make available until backtracking is complete.
	SU->isPending = true;
	PendingQueue.push_back(SU);
	}
	else {
	AvailableQueue->push(SU);
	}
	AvailableQueue->unscheduledNode(SU);
	}

	/// After backtracking, the hazard checker needs to be restored to a state
	/// corresponding to the current cycle.
	///
	/// The recognizer is reset and then replayed over the tail of Sequence, at
	/// most HazardRec->getMaxLookAhead() nodes long.
	void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() {
	  HazardRec->Reset();

	  unsigned LookAhead = std::min((unsigned)Sequence.size(),
	                                HazardRec->getMaxLookAhead());
	  if (LookAhead == 0)
	    return;

	  std::vector<SUnit *>::const_iterator I = (Sequence.end() - LookAhead);
	  unsigned HazardCycle = (*I)->getHeight();
	  for (auto E = Sequence.end(); I != E; ++I) {
	    SUnit *SU = *I;
	    // Step the recognizer until it has caught up with this node's height.
	    for (; SU->getHeight() > HazardCycle; ++HazardCycle) {
	      HazardRec->RecedeCycle();
	    }
	    EmitNode(SU);
	  }
	}

	/// BacktrackBottomUp - Backtrack scheduling to a previous cycle specified in
	/// BTCycle in order to schedule a specific node.
	void ScheduleDAGRRList::BacktrackBottomUp(SUnit SU, SUnit BtSU) {
	SUnit *OldSU = Sequence.back();
	while (true) {
	Sequence.pop_back();
	// FIXME: use ready cycle instead of height
	CurCycle = OldSU->getHeight();
	UnscheduleNodeBottomUp(OldSU);
	AvailableQueue->setCurCycle(CurCycle);
	if (OldSU == BtSU)
	break;
	OldSU = Sequence.back();
	}

	assert(!SU->isSucc(OldSU) && "Something is wrong!");

	RestoreHazardCheckerBottomUp();

	ReleasePending();

	++NumBacktracks;
	}

	/// Return true if SU's node, or any node glued to it, is an operand of N.
	static bool isOperandOf(const SUnit *SU, SDNode *N) {
	  for (const SDNode *SUNode = SU->getNode(); SUNode;
	       SUNode = SUNode->getGluedNode()) {
	    if (SUNode->isOperandOf(N))
	      return true;
	  }
	  return false;
	}

	/// TryUnfoldSU - Attempt to unfold the memory operand of SU into a separate
	/// load node plus the remaining operation.
	///
	/// Returns nullptr if the target cannot unfold the node (or unfolding yields
	/// three nodes), SU itself if the matching load already exists and has been
	/// scheduled, and otherwise the new SUnit created for the unfolded operation.
	SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) {
	  SDNode *N = SU->getNode();
	  // Use while over if to ease fall through.
	  SmallVector<SDNode *, 2> NewNodes;
	  if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
	    return nullptr;

	  // unfolding an x86 DEC64m operation results in store, dec, load which
	  // can't be handled here so quit
	  if (NewNodes.size() == 3)
	    return nullptr;

	  assert(NewNodes.size() == 2 && "Expected a load folding node!");

	  N = NewNodes[1];
	  SDNode *LoadNode = NewNodes[0];
	  unsigned NumVals = N->getNumValues();
	  unsigned OldNumVals = SU->getNode()->getNumValues();

	  // LoadNode may already exist. This can happen when there is another
	  // load from the same location and producing the same type of value
	  // but it has different alignment or volatileness.
	  bool isNewLoad = true;
	  SUnit *LoadSU;
	  if (LoadNode->getNodeId() != -1) {
	    LoadSU = &SUnits[LoadNode->getNodeId()];
	    // If LoadSU has already been scheduled, we should clone it but
	    // this would negate the benefit to unfolding so just return SU.
	    if (LoadSU->isScheduled)
	      return SU;
	    isNewLoad = false;
	  } else {
	    LoadSU = CreateNewSUnit(LoadNode);
	    LoadNode->setNodeId(LoadSU->NodeNum);

	    InitNumRegDefsLeft(LoadSU);
	    computeLatency(LoadSU);
	  }

	  DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");

	  // Now that we are committed to unfolding replace DAG Uses.
	  for (unsigned i = 0; i != NumVals; ++i)
	    DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
	  DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals - 1),
	                                 SDValue(LoadNode, 1));

	  SUnit *NewSU = CreateNewSUnit(N);
	  assert(N->getNodeId() == -1 && "Node already inserted!");
	  N->setNodeId(NewSU->NodeNum);

	  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
	  for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
	    if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
	      NewSU->isTwoAddress = true;
	      break;
	    }
	  }
	  if (MCID.isCommutable())
	    NewSU->isCommutable = true;

	  InitNumRegDefsLeft(NewSU);
	  computeLatency(NewSU);

	  // Record all the edges to and from the old SU, by category.
	  SmallVector<SDep, 4> ChainPreds;
	  SmallVector<SDep, 4> ChainSuccs;
	  SmallVector<SDep, 4> LoadPreds;
	  SmallVector<SDep, 4> NodePreds;
	  SmallVector<SDep, 4> NodeSuccs;
	  for (SDep &Pred : SU->Preds) {
	    if (Pred.isCtrl())
	      ChainPreds.push_back(Pred);
	    else if (isOperandOf(Pred.getSUnit(), LoadNode))
	      LoadPreds.push_back(Pred);
	    else
	      NodePreds.push_back(Pred);
	  }
	  for (SDep &Succ : SU->Succs) {
	    if (Succ.isCtrl())
	      ChainSuccs.push_back(Succ);
	    else
	      NodeSuccs.push_back(Succ);
	  }

	  // Now assign edges to the newly-created nodes.
	  for (const SDep &Pred : ChainPreds) {
	    RemovePred(SU, Pred);
	    if (isNewLoad)
	      AddPred(LoadSU, Pred);
	  }
	  for (const SDep &Pred : LoadPreds) {
	    RemovePred(SU, Pred);
	    if (isNewLoad)
	      AddPred(LoadSU, Pred);
	  }
	  for (const SDep &Pred : NodePreds) {
	    RemovePred(SU, Pred);
	    AddPred(NewSU, Pred);
	  }
	  for (SDep D : NodeSuccs) {
	    SUnit *SuccDep = D.getSUnit();
	    D.setSUnit(SU);
	    RemovePred(SuccDep, D);
	    D.setSUnit(NewSU);
	    AddPred(SuccDep, D);
	    // Balance register pressure.
	    if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled &&
	        !D.isCtrl() && NewSU->NumRegDefsLeft > 0)
	      --NewSU->NumRegDefsLeft;
	  }
	  for (SDep D : ChainSuccs) {
	    SUnit *SuccDep = D.getSUnit();
	    D.setSUnit(SU);
	    RemovePred(SuccDep, D);
	    if (isNewLoad) {
	      D.setSUnit(LoadSU);
	      AddPred(SuccDep, D);
	    }
	  }

	  // Add a data dependency to reflect that NewSU reads the value defined
	  // by LoadSU.
	  SDep D(LoadSU, SDep::Data, 0);
	  D.setLatency(LoadSU->Latency);
	  AddPred(NewSU, D);

	  if (isNewLoad)
	    AvailableQueue->addNode(LoadSU);
	  AvailableQueue->addNode(NewSU);

	  ++NumUnfolds;

	  if (NewSU->NumSuccsLeft == 0)
	    NewSU->isAvailable = true;

	  return NewSU;
	}

	/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
	/// successors to the newly created node.
	/// Returns nullptr when SU has no node, when glue prevents copying, or when a
	/// required unfold fails.
	SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
	SDNode *N = SU->getNode();
	if (!N)
	return nullptr;

	- if (SU->getNode()->getGluedNode())
	+ DEBUG(dbgs() << "Considering duplicating the SU\n");
	+ DEBUG(SU->dump(this));
	+
	+ if (N->getGluedNode() &&
	+ !TII->canCopyGluedNodeDuringSchedule(N)) {
	+ DEBUG(dbgs()
	+ << "Giving up because it has incoming glue and the target does not "
	+ "want to copy it\n");
	return nullptr;
	+ }

	SUnit *NewSU;
	bool TryUnfold = false;
	for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
	MVT VT = N->getSimpleValueType(i);
	- if (VT == MVT::Glue)
	+ if (VT == MVT::Glue) {
	+ DEBUG(dbgs() << "Giving up because it has outgoing glue\n");
	return nullptr;
	- else if (VT == MVT::Other)
	+ } else if (VT == MVT::Other)
	TryUnfold = true;
	}
	for (const SDValue &Op : N->op_values()) {
	MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
	- if (VT == MVT::Glue)
	+ if (VT == MVT::Glue && !TII->canCopyGluedNodeDuringSchedule(N)) {
	+ DEBUG(dbgs() << "Giving up because it one of the operands is glue and "
	+ "the target does not want to copy it\n");
	return nullptr;
	+ }
	}

	// If possible unfold instruction.
	if (TryUnfold) {
	SUnit *UnfoldSU = TryUnfoldSU(SU);
	if (!UnfoldSU)
	return nullptr;
	SU = UnfoldSU;
	N = SU->getNode();
	// If this can be scheduled don't bother duplicating and just return
	if (SU->NumSuccsLeft == 0)
	return SU;
	}

	DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n");
	NewSU = CreateClone(SU);

	// New SUnit has the exact same predecessors.
	for (SDep &Pred : SU->Preds)
	if (!Pred.isArtificial())
	AddPred(NewSU, Pred);

	// Only copy scheduled successors. Cut them from old node's successor
	// list and move them over.
	SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
	for (SDep &Succ : SU->Succs) {
	if (Succ.isArtificial())
	continue;
	SUnit *SuccSU = Succ.getSUnit();
	if (SuccSU->isScheduled) {
	SDep D = Succ;
	D.setSUnit(NewSU);
	AddPred(SuccSU, D);
	D.setSUnit(SU);
	DelDeps.push_back(std::make_pair(SuccSU, D));
	}
	}
	for (auto &DelDep : DelDeps)
	RemovePred(DelDep.first, DelDep.second);

	AvailableQueue->updateNode(SU);
	AvailableQueue->addNode(NewSU);

	++NumDups;
	return NewSU;
	}

	/// InsertCopiesAndMoveSuccs - Create a pair of register-copy SUnits for the
	/// value SU defines in Reg and re-point every already scheduled successor of
	/// SU at the second copy. Both copies are appended to Copies, the copy out of
	/// SrcRC first.
	void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
	                                              const TargetRegisterClass *DestRC,
	                                              const TargetRegisterClass *SrcRC,
	                                              SmallVectorImpl<SUnit*> &Copies) {
	  // Copy out of the source class ...
	  SUnit *CopyFromSU = CreateNewSUnit(nullptr);
	  CopyFromSU->CopySrcRC = SrcRC;
	  CopyFromSU->CopyDstRC = DestRC;

	  // ... and back into it.
	  SUnit *CopyToSU = CreateNewSUnit(nullptr);
	  CopyToSU->CopySrcRC = DestRC;
	  CopyToSU->CopyDstRC = SrcRC;

	  // Scheduled successors are moved over to CopyToSU. Their old edges are
	  // collected here and cut only after the walk over SU->Succs is finished.
	  SmallVector<std::pair<SUnit *, SDep>, 4> StaleEdges;
	  for (SDep &Succ : SU->Succs) {
	    if (Succ.isArtificial())
	      continue;

	    SUnit *SuccSU = Succ.getSUnit();
	    if (!SuccSU->isScheduled) {
	      // Avoid scheduling the def-side copy before other successors. Otherwise
	      // we could introduce another physreg interference on the copy and
	      // continue inserting copies indefinitely.
	      AddPred(SuccSU, SDep(CopyFromSU, SDep::Artificial));
	      continue;
	    }

	    SDep Moved = Succ;
	    Moved.setSUnit(CopyToSU);
	    AddPred(SuccSU, Moved);
	    StaleEdges.push_back(std::make_pair(SuccSU, Succ));
	  }
	  for (auto &Stale : StaleEdges)
	    RemovePred(Stale.first, Stale.second);

	  // Chain the copies: SU -> CopyFromSU -> CopyToSU.
	  SDep FromDep(SU, SDep::Data, Reg);
	  FromDep.setLatency(SU->Latency);
	  AddPred(CopyFromSU, FromDep);

	  SDep ToDep(CopyFromSU, SDep::Data, 0);
	  ToDep.setLatency(CopyFromSU->Latency);
	  AddPred(CopyToSU, ToDep);

	  AvailableQueue->updateNode(SU);
	  AvailableQueue->addNode(CopyFromSU);
	  AvailableQueue->addNode(CopyToSU);
	  Copies.push_back(CopyFromSU);
	  Copies.push_back(CopyToSU);

	  ++NumPRCopies;
	}

	/// getPhysicalRegisterVT - Returns the ValueType of the physical register
	/// definition of the specified node.
	///
	/// For machine nodes the result index is the number of explicit defs plus the
	/// position of Reg in the instruction's implicit-def list.
	/// FIXME: Move to SelectionDAG?
	static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
	                                 const TargetInstrInfo *TII) {
	  unsigned NumRes;
	  if (N->getOpcode() == ISD::CopyFromReg) {
	    // CopyFromReg has: "chain, Val, glue" so operand 1 gives the type.
	    NumRes = 1;
	  } else {
	    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
	    assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
	    NumRes = MCID.getNumDefs();
	    // Walk the implicit-def list until Reg is found or a zero entry ends it.
	    for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
	      if (Reg == *ImpDef)
	        break;
	      ++NumRes;
	    }
	  }
	  return N->getSimpleValueType(NumRes);
	}

	/// CheckForLiveRegDef - Append to LRegs every register aliasing Reg (Reg
	/// itself included) that is currently live with a def other than SU.
	/// RegAdded makes sure each register is reported only once.
	static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
	                               SUnit **LiveRegDefs,
	                               SmallSet<unsigned, 4> &RegAdded,
	                               SmallVectorImpl<unsigned> &LRegs,
	                               const TargetRegisterInfo *TRI) {
	  for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
	    const unsigned AliasReg = *Alias;
	    SUnit *LiveDef = LiveRegDefs[AliasReg];

	    // Skip registers that are not live, and allow multiple uses of the
	    // same def.
	    if (!LiveDef || LiveDef == SU)
	      continue;

	    // Record the interfering live register once.
	    if (RegAdded.insert(AliasReg).second)
	      LRegs.push_back(AliasReg);
	  }
	}

	/// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered
	/// by RegMask, and add them to LRegs.
	static void CheckForLiveRegDefMasked(SUnit SU, const uint32_t RegMask,
	ArrayRef<SUnit*> LiveRegDefs,
	SmallSet<unsigned, 4> &RegAdded,
	SmallVectorImpl<unsigned> &LRegs) {
	// Look at all live registers. Skip Reg0 and the special CallResource.
	for (unsigned i = 1, e = LiveRegDefs.size()-1; i != e; ++i) {
	if (!LiveRegDefs[i]) continue;
	if (LiveRegDefs[i] == SU) continue;
	if (!MachineOperand::clobbersPhysReg(RegMask, i)) continue;
	if (RegAdded.insert(i).second)
	LRegs.push_back(i);
	}
	}

	/// getNodeRegMask - Returns the register mask attached to an SDNode, if any.
	/// The mask of the first RegisterMaskSDNode operand is returned; nullptr means
	/// the node has no such operand.
	static const uint32_t *getNodeRegMask(const SDNode *N) {
	  for (const SDValue &Op : N->op_values())
	    if (const auto *RegOp = dyn_cast<RegisterMaskSDNode>(Op.getNode()))
	      return RegOp->getRegMask();
	  return nullptr;
	}

	/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
	/// scheduling of the given node to satisfy live physical register dependencies.
	/// If the specific node is the last one that's available to schedule, do
	/// whatever is necessary (i.e. backtracking or cloning) to make it possible.
	///
	/// The interfering live registers are appended to LRegs; the return value is
	/// simply whether LRegs is non-empty afterwards.
	bool ScheduleDAGRRList::
	DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
	  if (NumLiveRegs == 0)
	    return false;

	  SmallSet<unsigned, 4> RegAdded;
	  // If this node would clobber any "live" register, then it's not ready.
	  //
	  // If SU is the currently live definition of the same register that it uses,
	  // then we are free to schedule it.
	  for (SDep &Pred : SU->Preds) {
	    if (Pred.isAssignedRegDep() && LiveRegDefs[Pred.getReg()] != SU)
	      CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs.get(),
	                         RegAdded, LRegs, TRI);
	  }

	  for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) {
	    if (Node->getOpcode() == ISD::INLINEASM) {
	      // Inline asm can clobber physical defs.
	      unsigned NumOps = Node->getNumOperands();
	      if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
	        --NumOps;  // Ignore the glue operand.

	      for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
	        unsigned Flags =
	          cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
	        unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);

	        ++i; // Skip the ID value.
	        if (InlineAsm::isRegDefKind(Flags) ||
	            InlineAsm::isRegDefEarlyClobberKind(Flags) ||
	            InlineAsm::isClobberKind(Flags)) {
	          // Check for def of register or earlyclobber register.
	          for (; NumVals; --NumVals, ++i) {
	            unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
	            if (TargetRegisterInfo::isPhysicalRegister(Reg))
	              CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
	          }
	        } else
	          i += NumVals;
	      }
	      continue;
	    }

	    if (!Node->isMachineOpcode())
	      continue;
	    // If we're in the middle of scheduling a call, don't begin scheduling
	    // another call. Also, don't allow any physical registers to be live across
	    // the call.
	    if (Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
	      // Check the special calling-sequence resource.
	      unsigned CallResource = TRI->getNumRegs();
	      if (LiveRegDefs[CallResource]) {
	        SDNode *Gen = LiveRegGens[CallResource]->getNode();
	        while (SDNode *Glued = Gen->getGluedNode())
	          Gen = Glued;
	        if (!IsChainDependent(Gen, Node, 0, TII) &&
	            RegAdded.insert(CallResource).second)
	          LRegs.push_back(CallResource);
	      }
	    }
	    if (const uint32_t *RegMask = getNodeRegMask(Node))
	      CheckForLiveRegDefMasked(SU, RegMask,
	                               makeArrayRef(LiveRegDefs.get(), TRI->getNumRegs()),
	                               RegAdded, LRegs);

	    const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
	    if (MCID.hasOptionalDef()) {
	      // Most ARM instructions have an OptionalDef for CPSR, to model the S-bit.
	      // This operand can be either a def of CPSR, if the S bit is set; or a use
	      // of %noreg. When the OptionalDef is set to a valid register, we need to
	      // handle it in the same way as an ImplicitDef.
	      for (unsigned i = 0; i < MCID.getNumDefs(); ++i)
	        if (MCID.OpInfo[i].isOptionalDef()) {
	          const SDValue &OptionalDef = Node->getOperand(i - Node->getNumValues());
	          unsigned Reg = cast<RegisterSDNode>(OptionalDef)->getReg();
	          CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
	        }
	    }
	    if (!MCID.ImplicitDefs)
	      continue;
	    // The implicit-def list is walked until a zero entry.
	    for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg)
	      CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
	  }

	  return !LRegs.empty();
	}

	void ScheduleDAGRRList::releaseInterferences(unsigned Reg) {
	// Add the nodes that aren't ready back onto the available list.
	for (unsigned i = Interferences.size(); i > 0; --i) {
	SUnit *SU = Interferences[i-1];
	LRegsMapT::iterator LRegsPos = LRegsMap.find(SU);
	if (Reg) {
	SmallVectorImpl<unsigned> &LRegs = LRegsPos->second;
	if (!is_contained(LRegs, Reg))
	continue;
	}
	SU->isPending = false;
	// The interfering node may no longer be available due to backtracking.
	// Furthermore, it may have been made available again, in which case it is
	// now already in the AvailableQueue.
	if (SU->isAvailable && !SU->NodeQueueId) {
	DEBUG(dbgs() << " Repushing SU #" << SU->NodeNum << '\n');
	AvailableQueue->push(SU);
	}
	if (i < Interferences.size())
	Interferences[i-1] = Interferences.back();
	Interferences.pop_back();
	LRegsMap.erase(LRegsPos);
	}
	}

	/// Return a node that can be scheduled in this cycle. Requirements:
	/// (1) Ready: latency has been satisfied
	/// (2) No Hazards: resources are available
	/// (3) No Interferences: may unschedule to break register interferences.
	///
	/// When every candidate is blocked by a live physical register, this tries
	/// backtracking first and otherwise duplicates the defining node or inserts
	/// register copies.
	SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
	  SUnit *CurSU = AvailableQueue->empty() ? nullptr : AvailableQueue->pop();
	  // Pop candidates until one has no live-register interference; blocked
	  // candidates are parked in Interferences/LRegsMap.
	  auto FindAvailableNode = [&]() {
	    while (CurSU) {
	      SmallVector<unsigned, 4> LRegs;
	      if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
	        break;
	      DEBUG(dbgs() << " Interfering reg ";
	            if (LRegs[0] == TRI->getNumRegs())
	              dbgs() << "CallResource";
	            else
	              dbgs() << printReg(LRegs[0], TRI);
	            dbgs() << " SU #" << CurSU->NodeNum << '\n');
	      std::pair<LRegsMapT::iterator, bool> LRegsPair =
	        LRegsMap.insert(std::make_pair(CurSU, LRegs));
	      if (LRegsPair.second) {
	        CurSU->isPending = true;  // This SU is not in AvailableQueue right now.
	        Interferences.push_back(CurSU);
	      }
	      else {
	        assert(CurSU->isPending && "Interferences are pending");
	        // Update the interference with current live regs.
	        LRegsPair.first->second = LRegs;
	      }
	      CurSU = AvailableQueue->pop();
	    }
	  };
	  FindAvailableNode();
	  if (CurSU)
	    return CurSU;

	  // All candidates are delayed due to live physical reg dependencies.
	  // Try backtracking, code duplication, or inserting cross class copies
	  // to resolve it.
	  for (SUnit *TrySU : Interferences) {
	    SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];

	    // Try unscheduling up to the point where it's safe to schedule
	    // this node.
	    SUnit *BtSU = nullptr;
	    unsigned LiveCycle = std::numeric_limits<unsigned>::max();
	    for (unsigned Reg : LRegs) {
	      if (LiveRegGens[Reg]->getHeight() < LiveCycle) {
	        BtSU = LiveRegGens[Reg];
	        LiveCycle = BtSU->getHeight();
	      }
	    }
	    if (!WillCreateCycle(TrySU, BtSU))  {
	      // BacktrackBottomUp mutates Interferences!
	      BacktrackBottomUp(TrySU, BtSU);

	      // Force the current node to be scheduled before the node that
	      // requires the physical reg dep.
	      if (BtSU->isAvailable) {
	        BtSU->isAvailable = false;
	        if (!BtSU->isPending)
	          AvailableQueue->remove(BtSU);
	      }
	      DEBUG(dbgs() << "ARTIFICIAL edge from SU(" << BtSU->NodeNum << ") to SU("
	            << TrySU->NodeNum << ")\n");
	      AddPred(TrySU, SDep(BtSU, SDep::Artificial));

	      // If one or more successors has been unscheduled, then the current
	      // node is no longer available.
	      if (!TrySU->isAvailable || !TrySU->NodeQueueId) {
	        DEBUG(dbgs() << "TrySU not available; choosing node from queue\n");
	        CurSU = AvailableQueue->pop();
	      } else {
	        DEBUG(dbgs() << "TrySU available\n");
	        // Available and in AvailableQueue
	        AvailableQueue->remove(TrySU);
	        CurSU = TrySU;
	      }
	      FindAvailableNode();
	      // Interferences has been mutated. We must break.
	      break;
	    }
	  }

	  if (!CurSU) {
	    // Can't backtrack. If it's too expensive to copy the value, then try
	    // duplicate the nodes that produces these "too expensive to copy"
	    // values to break the dependency. In case even that doesn't work,
	    // insert cross class copies.
	    // If it's not too expensive, i.e. cost != -1, issue copies.
	    SUnit *TrySU = Interferences[0];
	    SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
	    assert(LRegs.size() == 1 && "Can't handle this yet!");
	    unsigned Reg = LRegs[0];
	    SUnit *LRDef = LiveRegDefs[Reg];
	    MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
	    const TargetRegisterClass *RC =
	      TRI->getMinimalPhysRegClass(Reg, VT);
	    const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);

	    // If cross copy register class is the same as RC, then it must be possible
	    // copy the value directly. Do not try duplicate the def.
	    // If cross copy register class is not the same as RC, then it's possible to
	    // copy the value but it require cross register class copies and it is
	    // expensive.
	    // If cross copy register class is null, then it's not possible to copy
	    // the value at all.
	    SUnit *NewDef = nullptr;
	    if (DestRC != RC) {
	      NewDef = CopyAndMoveSuccessors(LRDef);
	      if (!DestRC && !NewDef)
	        report_fatal_error("Can't handle live physical register dependency!");
	    }
	    if (!NewDef) {
	      // Issue copies, these can be expensive cross register class copies.
	      SmallVector<SUnit*, 2> Copies;
	      InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
	      DEBUG(dbgs() << " Adding an edge from SU #" << TrySU->NodeNum
	            << " to SU #" << Copies.front()->NodeNum << "\n");
	      AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
	      NewDef = Copies.back();
	    }

	    DEBUG(dbgs() << " Adding an edge from SU #" << NewDef->NodeNum
	          << " to SU #" << TrySU->NodeNum << "\n");
	    LiveRegDefs[Reg] = NewDef;
	    AddPred(NewDef, SDep(TrySU, SDep::Artificial));
	    TrySU->isAvailable = false;
	    CurSU = NewDef;
	  }
	  assert(CurSU && "Unable to resolve live physical register dependencies!");
	  return CurSU;
	}

	/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
	/// schedulers.
	///
	/// Nodes are appended to Sequence in bottom-up order and the sequence is
	/// reversed once scheduling is complete.
	void ScheduleDAGRRList::ListScheduleBottomUp() {
	  // Release any predecessors of the special Exit node.
	  ReleasePredecessors(&ExitSU);

	  // Add root to Available queue.
	  if (!SUnits.empty()) {
	    SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
	    assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
	    RootSU->isAvailable = true;
	    AvailableQueue->push(RootSU);
	  }

	  // While Available queue is not empty, grab the node with the highest
	  // priority. If it is not ready put it back.  Schedule the node.
	  Sequence.reserve(SUnits.size());
	  while (!AvailableQueue->empty() || !Interferences.empty()) {
	    DEBUG(dbgs() << "\nExamining Available:\n";
	          AvailableQueue->dump(this));

	    // Pick the best node to schedule taking all constraints into
	    // consideration.
	    SUnit *SU = PickNodeToScheduleBottomUp();

	    AdvancePastStalls(SU);

	    ScheduleNodeBottomUp(SU);

	    while (AvailableQueue->empty() && !PendingQueue.empty()) {
	      // Advance the cycle to free resources. Skip ahead to the next ready SU.
	      assert(MinAvailableCycle < std::numeric_limits<unsigned>::max() &&
	             "MinAvailableCycle uninitialized");
	      AdvanceToCycle(std::max(CurCycle + 1, MinAvailableCycle));
	    }
	  }

	  // Reverse the order if it is bottom up.
	  std::reverse(Sequence.begin(), Sequence.end());

	#ifndef NDEBUG
	  VerifyScheduledSequence(/*isBottomUp=*/true);
	#endif
	}

	namespace {

	class RegReductionPQBase;

	/// Common base of the priority functors that follow. Its isReady treats every
	/// node as ready regardless of the cycle.
	struct queue_sort {
	  bool isReady(SUnit* SU, unsigned CurCycle) const {
	    return true;
	  }
	};

	#ifndef NDEBUG
	/// Debug-only adapter that invokes the wrapped sort functor with its two
	/// arguments exchanged.
	template<class SF>
	struct reverse_sort : public queue_sort {
	  SF &SortFunc;

	  reverse_sort(SF &sf) : SortFunc(sf) {}

	  // The operands are swapped, rather than the result negated, so that
	  // different paths in the comparison logic get exercised.
	  bool operator()(SUnit* left, SUnit* right) const {
	    return SortFunc(right, left);
	  }
	};
	#endif // NDEBUG

	/// bu_ls_rr_sort - Priority function for the bottom up register pressure
	/// reduction scheduler. The comparison itself is only declared here.
	struct bu_ls_rr_sort : public queue_sort {
	  enum {
	    IsBottomUp = true,
	    HasReadyFilter = false
	  };

	  /// Queue this functor belongs to.
	  RegReductionPQBase *SPQ;

	  bu_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}

	  bool operator()(SUnit* left, SUnit* right) const;
	};

	/// src_ls_rr_sort - Priority function for the source order scheduler.
	/// The comparison itself is only declared here.
	struct src_ls_rr_sort : public queue_sort {
	  enum {
	    IsBottomUp = true,
	    HasReadyFilter = false
	  };

	  /// Queue this functor belongs to.
	  RegReductionPQBase *SPQ;

	  src_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}

	  bool operator()(SUnit* left, SUnit* right) const;
	};

	/// hybrid_ls_rr_sort - Priority function for the hybrid scheduler.
	/// Unlike the plain queue_sort base it declares its own isReady; both it and
	/// the comparison are only declared here.
	struct hybrid_ls_rr_sort : public queue_sort {
	  enum {
	    IsBottomUp = true,
	    HasReadyFilter = false
	  };

	  /// Queue this functor belongs to.
	  RegReductionPQBase *SPQ;

	  hybrid_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}

	  bool isReady(SUnit *SU, unsigned CurCycle) const;

	  bool operator()(SUnit* left, SUnit* right) const;
	};

	/// ilp_ls_rr_sort - Priority function for the ILP (instruction level
	/// parallelism) scheduler. It declares its own isReady; both it and the
	/// comparison are only declared here.
	struct ilp_ls_rr_sort : public queue_sort {
	  enum {
	    IsBottomUp = true,
	    HasReadyFilter = false
	  };

	  /// Queue this functor belongs to.
	  RegReductionPQBase *SPQ;

	  ilp_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}

	  bool isReady(SUnit *SU, unsigned CurCycle) const;

	  bool operator()(SUnit* left, SUnit* right) const;
	};

	class RegReductionPQBase : public SchedulingPriorityQueue {
	protected:
	std::vector<SUnit *> Queue;
	unsigned CurQueueId = 0;
	bool TracksRegPressure;
	bool SrcOrder;

	// SUnits - The SUnits for the current graph.
	std::vector<SUnit> *SUnits;

	MachineFunction &MF;
	const TargetInstrInfo *TII;
	const TargetRegisterInfo *TRI;
	const TargetLowering *TLI;
	ScheduleDAGRRList *scheduleDAG = nullptr;

	// SethiUllmanNumbers - The SethiUllman number for each node.
	std::vector<unsigned> SethiUllmanNumbers;

	/// RegPressure - Tracking current reg pressure per register class.
	std::vector<unsigned> RegPressure;

	/// RegLimit - Tracking the number of allocatable registers per register
	/// class.
	std::vector<unsigned> RegLimit;

	public:
	RegReductionPQBase(MachineFunction &mf,
	bool hasReadyFilter,
	bool tracksrp,
	bool srcorder,
	const TargetInstrInfo *tii,
	const TargetRegisterInfo *tri,
	const TargetLowering *tli)
	: SchedulingPriorityQueue(hasReadyFilter), TracksRegPressure(tracksrp),
	SrcOrder(srcorder), MF(mf), TII(tii), TRI(tri), TLI(tli) {
	if (TracksRegPressure) {
	unsigned NumRC = TRI->getNumRegClasses();
	RegLimit.resize(NumRC);
	RegPressure.resize(NumRC);
	std::fill(RegLimit.begin(), RegLimit.end(), 0);
	std::fill(RegPressure.begin(), RegPressure.end(), 0);
	for (const TargetRegisterClass *RC : TRI->regclasses())
	RegLimit[RC->getID()] = tri->getRegPressureLimit(RC, MF);
	}
	}

	void setScheduleDAG(ScheduleDAGRRList *scheduleDag) {
	scheduleDAG = scheduleDag;
	}

	ScheduleHazardRecognizer* getHazardRec() {
	return scheduleDAG->getHazardRec();
	}

	void initNodes(std::vector<SUnit> &sunits) override;

	void addNode(const SUnit *SU) override;

	void updateNode(const SUnit *SU) override;

	void releaseState() override {
	SUnits = nullptr;
	SethiUllmanNumbers.clear();
	std::fill(RegPressure.begin(), RegPressure.end(), 0);
	}

	unsigned getNodePriority(const SUnit *SU) const;

	unsigned getNodeOrdering(const SUnit *SU) const {
	if (!SU->getNode()) return 0;

	return SU->getNode()->getIROrder();
	}

	bool empty() const override { return Queue.empty(); }

	void push(SUnit *U) override {
	assert(!U->NodeQueueId && "Node in the queue already");
	U->NodeQueueId = ++CurQueueId;
	Queue.push_back(U);
	}

	void remove(SUnit *SU) override {
	assert(!Queue.empty() && "Queue is empty!");
	assert(SU->NodeQueueId != 0 && "Not in queue!");
	std::vector<SUnit *>::iterator I = llvm::find(Queue, SU);
	if (I != std::prev(Queue.end()))
	std::swap(*I, Queue.back());
	Queue.pop_back();
	SU->NodeQueueId = 0;
	}

	bool tracksRegPressure() const override { return TracksRegPressure; }

	void dumpRegPressure() const;

	bool HighRegPressure(const SUnit *SU) const;

	bool MayReduceRegPressure(SUnit *SU) const;

	int RegPressureDiff(SUnit *SU, unsigned &LiveUses) const;

	void scheduledNode(SUnit *SU) override;

	void unscheduledNode(SUnit *SU) override;

	protected:
	bool canClobber(const SUnit SU, const SUnit Op);
	void AddPseudoTwoAddrDeps();
	void PrescheduleNodesWithMultipleUses();
	void CalculateSethiUllmanNumbers();
	};

	/// Remove and return the element of \p Q that \p Picker ranks best.
	///
	/// The queue is unsorted, so this is a linear scan: Picker(a, b) returning
	/// true makes b replace a as the current best. The winner is swapped to the
	/// back and popped, so the order of the remaining elements is not preserved.
	///
	/// \pre \p Q is not empty.
	template<class SF>
	static SUnit *popFromQueueImpl(std::vector<SUnit *> &Q, SF &Picker) {
	  assert(!Q.empty() && "Cannot pop from an empty queue");
	  std::vector<SUnit *>::iterator Best = Q.begin();
	  for (auto I = std::next(Q.begin()), E = Q.end(); I != E; ++I)
	    if (Picker(*Best, *I))
	      Best = I;
	  SUnit *V = *Best;
	  if (Best != std::prev(Q.end()))
	    std::swap(*Best, Q.back());
	  Q.pop_back();
	  return V;
	}

	/// Pop the best element of \p Q according to \p Picker.
	///
	/// In builds with assertions enabled, when DAG->StressSched is set the
	/// comparison is wrapped in reverse_sort before picking. In NDEBUG builds
	/// \p DAG is unused.
	template<class SF>
	SUnit *popFromQueue(std::vector<SUnit *> &Q, SF &Picker, ScheduleDAG *DAG) {
	#ifndef NDEBUG
	  if (DAG->StressSched) {
	    reverse_sort<SF> RPicker(Picker);
	    return popFromQueueImpl(Q, RPicker);
	  }
	#endif
	  (void)DAG;
	  return popFromQueueImpl(Q, Picker);
	}

	//===----------------------------------------------------------------------===//
	// RegReductionPriorityQueue Definition
	//===----------------------------------------------------------------------===//
	//
	// This is a SchedulingPriorityQueue that schedules using Sethi Ullman numbers
	// to reduce register pressure.
	//
	template<class SF>
	class RegReductionPriorityQueue : public RegReductionPQBase {
	SF Picker;

	public:
	RegReductionPriorityQueue(MachineFunction &mf,
	bool tracksrp,
	bool srcorder,
	const TargetInstrInfo *tii,
	const TargetRegisterInfo *tri,
	const TargetLowering *tli)
	: RegReductionPQBase(mf, SF::HasReadyFilter, tracksrp, srcorder,
	tii, tri, tli),
	Picker(this) {}

	bool isBottomUp() const override { return SF::IsBottomUp; }

	bool isReady(SUnit *U) const override {
	return Picker.HasReadyFilter && Picker.isReady(U, getCurCycle());
	}

	SUnit *pop() override {
	if (Queue.empty()) return nullptr;

	SUnit *V = popFromQueue(Queue, Picker, scheduleDAG);
	V->NodeQueueId = 0;
	return V;
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override {
	// Emulate pop() without clobbering NodeQueueIds.
	std::vector<SUnit *> DumpQueue = Queue;
	SF DumpPicker = Picker;
	while (!DumpQueue.empty()) {
	SUnit *SU = popFromQueue(DumpQueue, DumpPicker, scheduleDAG);
	dbgs() << "Height " << SU->getHeight() << ": ";
	SU->dump(DAG);
	}
	}
	#endif
	};

	// Concrete queue types: one RegReductionPriorityQueue instantiation per
	// comparison functor.
	using BURegReductionPriorityQueue = RegReductionPriorityQueue<bu_ls_rr_sort>;
	using SrcRegReductionPriorityQueue = RegReductionPriorityQueue<src_ls_rr_sort>;
	using HybridBURRPriorityQueue = RegReductionPriorityQueue<hybrid_ls_rr_sort>;
	using ILPBURRPriorityQueue = RegReductionPriorityQueue<ilp_ls_rr_sort>;

	} // end anonymous namespace

	//===----------------------------------------------------------------------===//
	// Static Node Priority for Register Pressure Reduction
	//===----------------------------------------------------------------------===//

	// Check for special nodes that bypass scheduling heuristics.
	// Currently this pushes TokenFactor nodes down, but may be used for other
	// pseudo-ops as well.
	//
	// Return -1 to schedule right above left, 1 for left above right.
	// Return 0 if no bias exists.
	static int checkSpecialNodes(const SUnit left, const SUnit right) {
	bool LSchedLow = left->isScheduleLow;
	bool RSchedLow = right->isScheduleLow;
	if (LSchedLow != RSchedLow)
	return LSchedLow < RSchedLow ? 1 : -1;
	return 0;
	}

	/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
	/// Smaller number is the higher priority.
	///
	/// \p SUNumbers is indexed by SUnit::NodeNum and doubles as the memo table:
	/// an entry of 0 means "not computed yet", so every computed number is >= 1.
	/// A node's number is the largest number among its data (non-ctrl)
	/// predecessors, plus one for every further predecessor that ties that
	/// maximum; a node without data predecessors gets 1.
	///
	/// \returns the number for \p SU, which is also stored in \p SUNumbers.
	static unsigned
	CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
	// Already computed: return the memoized value.
	if (SUNumbers[SU->NodeNum] != 0)
	return SUNumbers[SU->NodeNum];

	// Use WorkList to avoid stack overflow on excessively large IRs.
	// Each entry remembers how far its predecessor list has been scanned.
	struct WorkState {
	WorkState(const SUnit *SU) : SU(SU) {}
	const SUnit *SU;
	unsigned PredsProcessed = 0;
	};

	SmallVector<WorkState, 16> WorkList;
	WorkList.push_back(SU);
	while (!WorkList.empty()) {
	// NOTE: Temp refers into WorkList; it is not used again after the
	// push_back below, which may reallocate the storage.
	auto &Temp = WorkList.back();
	auto *TempSU = Temp.SU;
	bool AllPredsKnown = true;
	// Try to find a non-evaluated pred and push it into the processing stack.
	for (unsigned P = Temp.PredsProcessed; P < TempSU->Preds.size(); ++P) {
	auto &Pred = TempSU->Preds[P];
	if (Pred.isCtrl()) continue; // ignore chain preds
	SUnit *PredSU = Pred.getSUnit();
	if (SUNumbers[PredSU->NodeNum] == 0) {
	#ifndef NDEBUG
	// In debug mode, check that we don't have such element in the stack.
	for (auto It : WorkList)
	assert(It.SU != PredSU && "Trying to push an element twice?");
	#endif
	// Next time start processing this one starting from the next pred.
	Temp.PredsProcessed = P + 1;
	WorkList.push_back(PredSU);
	AllPredsKnown = false;
	break;
	}
	}

	// A pred was pushed: evaluate it first and revisit this node afterwards.
	if (!AllPredsKnown)
	continue;

	// Once all preds are known, we can calculate the answer for this one.
	// SethiUllmanNumber tracks the running maximum, Extra the number of
	// additional preds that equal it.
	unsigned SethiUllmanNumber = 0;
	unsigned Extra = 0;
	for (const SDep &Pred : TempSU->Preds) {
	if (Pred.isCtrl()) continue; // ignore chain preds
	SUnit *PredSU = Pred.getSUnit();
	unsigned PredSethiUllman = SUNumbers[PredSU->NodeNum];
	assert(PredSethiUllman > 0 && "We should have evaluated this pred!");
	if (PredSethiUllman > SethiUllmanNumber) {
	SethiUllmanNumber = PredSethiUllman;
	Extra = 0;
	} else if (PredSethiUllman == SethiUllmanNumber)
	++Extra;
	}

	SethiUllmanNumber += Extra;
	// 0 is reserved for "not computed", so the minimum stored value is 1.
	if (SethiUllmanNumber == 0)
	SethiUllmanNumber = 1;
	SUNumbers[TempSU->NodeNum] = SethiUllmanNumber;
	WorkList.pop_back();
	}

	assert(SUNumbers[SU->NodeNum] > 0 && "SethiUllman should never be zero!");
	return SUNumbers[SU->NodeNum];
	}

	/// Recompute the Sethi-Ullman number of every scheduling unit in the
	/// current graph. The table is first reset to one zero ("not computed")
	/// entry per unit.
	void RegReductionPQBase::CalculateSethiUllmanNumbers() {
	  SethiUllmanNumbers.assign(SUnits->size(), 0);

	  for (const SUnit &Unit : *SUnits) {
	    CalcNodeSethiUllmanNumber(&Unit, SethiUllmanNumbers);
	  }
	}

	/// Make sure SethiUllmanNumbers has a slot for every unit currently in
	/// *SUnits, then compute the Sethi-Ullman number of \p SU.
	void RegReductionPQBase::addNode(const SUnit *SU) {
	  size_t SUSize = SethiUllmanNumbers.size();
	  if (SUnits->size() > SUSize) {
	    // Grow geometrically, but never to less than the number of units: plain
	    // doubling leaves the table too small when it is empty or when the graph
	    // has more than doubled since the table was last sized.
	    size_t NewSize = SUSize * 2;
	    if (NewSize < SUnits->size())
	      NewSize = SUnits->size();
	    SethiUllmanNumbers.resize(NewSize, 0);
	  }
	  CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
	}

	/// Discard the cached Sethi-Ullman number of \p SU and compute it afresh.
	/// Only the entry of \p SU itself is invalidated.
	void RegReductionPQBase::updateNode(const SUnit *SU) {
	  unsigned &Cached = SethiUllmanNumbers[SU->NodeNum];
	  Cached = 0; // 0 marks the entry as "not computed"
	  CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
	}

	// Lower priority means schedule further down. For bottom-up scheduling, lower
	// priority SUs are scheduled before higher priority SUs.
	//
	// Returns 0 for the special cases listed below, 0xffff for nodes that end a
	// chain of computation, and the node's Sethi-Ullman number otherwise.
	unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
	  assert(SU->NodeNum < SethiUllmanNumbers.size());
	  unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
	  if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
	    // CopyToReg should be close to its uses to facilitate coalescing and
	    // avoid spilling.
	    return 0;
	  if (Opc == TargetOpcode::EXTRACT_SUBREG ||
	      Opc == TargetOpcode::SUBREG_TO_REG ||
	      Opc == TargetOpcode::INSERT_SUBREG)
	    // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
	    // close to their uses to facilitate coalescing.
	    return 0;
	  if (SU->NumSuccs == 0 && SU->NumPreds != 0)
	    // If SU does not have a register use, i.e. it doesn't produce a value
	    // that would be consumed (e.g. store), then it terminates a chain of
	    // computation. Give it a large SethiUllman number so it will be
	    // scheduled right before its predecessors, so that it doesn't lengthen
	    // their live ranges.
	    return 0xffff;
	  if (SU->NumPreds == 0 && SU->NumSuccs != 0)
	    // If SU does not have a register def, schedule it close to its uses
	    // because it does not lengthen any live ranges.
	    return 0;
	#if 1
	  return SethiUllmanNumbers[SU->NodeNum];
	#else
	  // Disabled alternative: discount call operands by their value count.
	  unsigned Priority = SethiUllmanNumbers[SU->NodeNum];
	  if (SU->isCallOp) {
	    // FIXME: This assumes all of the defs are used as call operands.
	    int NP = (int)Priority - SU->getNode()->getNumValues();
	    return (NP > 0) ? NP : 0;
	  }
	  return Priority;
	#endif
	}

	//===----------------------------------------------------------------------===//
	// Register Pressure Tracking
	//===----------------------------------------------------------------------===//

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const {
	for (const TargetRegisterClass *RC : TRI->regclasses()) {
	unsigned Id = RC->getID();
	unsigned RP = RegPressure[Id];
	if (!RP) continue;
	DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "
	<< RegLimit[Id] << '\n');
	}
	}
	#endif

	/// Return true if some register def of a data predecessor of \p SU would
	/// bring its register class to or above the class limit
	/// (RegPressure + cost >= RegLimit). Always false when no TargetLowering
	/// is available.
	bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
	  if (TLI == nullptr)
	    return false;

	  for (const SDep &Dep : SU->Preds) {
	    if (Dep.isCtrl())
	      continue;
	    SUnit *Def = Dep.getSUnit();
	    // NumRegDefsLeft is zero when enough uses of this node have been
	    // scheduled to cover the number of registers defined (they are all
	    // live).
	    if (Def->NumRegDefsLeft == 0)
	      continue;

	    ScheduleDAGSDNodes::RegDefIter DefIt(Def, scheduleDAG);
	    while (DefIt.IsValid()) {
	      unsigned RCId, Cost;
	      GetCostForDef(DefIt, TLI, TII, TRI, RCId, Cost, MF);

	      const unsigned Projected = RegPressure[RCId] + Cost;
	      if (Projected >= RegLimit[RCId])
	        return true;
	      DefIt.Advance();
	    }
	  }
	  return false;
	}

	/// Return true if \p SU is a machine node with data successors that defines
	/// at least one used value whose register class is already at or above its
	/// limit.
	bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const {
	  const SDNode *N = SU->getNode();

	  // Units without an SDNode cannot be inspected; RegPressureDiff applies the
	  // same guard.
	  if (!N || !N->isMachineOpcode() || !SU->NumSuccs)
	    return false;

	  unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
	  for (unsigned i = 0; i != NumDefs; ++i) {
	    MVT VT = N->getSimpleValueType(i);
	    if (!N->hasAnyUseOfValue(i))
	      continue;
	    unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
	    if (RegPressure[RCId] >= RegLimit[RCId])
	      return true;
	  }
	  return false;
	}

	// Compute the register pressure contribution by this instruction by count up
	// for uses that are not live and down for defs. Only count register classes
	// that are already under high pressure. As a side effect, compute the number of
	// uses of registers that are already live.
	//
	// LiveUses is reset to 0 on entry and counts data predecessors that are
	// machine nodes with NumRegDefsLeft == 0.
	//
	// FIXME: This encompasses the logic in HighRegPressure and MayReduceRegPressure
	// so could probably be factored.
	int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const {
	  LiveUses = 0;
	  int PDiff = 0;
	  for (const SDep &Pred : SU->Preds) {
	    if (Pred.isCtrl())
	      continue;
	    SUnit *PredSU = Pred.getSUnit();
	    // NumRegDefsLeft is zero when enough uses of this node have been scheduled
	    // to cover the number of registers defined (they are all live).
	    if (PredSU->NumRegDefsLeft == 0) {
	      if (PredSU->getNode()->isMachineOpcode())
	        ++LiveUses;
	      continue;
	    }
	    for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
	         RegDefPos.IsValid(); RegDefPos.Advance()) {
	      MVT VT = RegDefPos.GetValue();
	      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
	      if (RegPressure[RCId] >= RegLimit[RCId])
	        ++PDiff;
	    }
	  }
	  const SDNode *N = SU->getNode();

	  // Only machine nodes with data successors contribute defs.
	  if (!N || !N->isMachineOpcode() || !SU->NumSuccs)
	    return PDiff;

	  unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
	  for (unsigned i = 0; i != NumDefs; ++i) {
	    MVT VT = N->getSimpleValueType(i);
	    if (!N->hasAnyUseOfValue(i))
	      continue;
	    unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
	    if (RegPressure[RCId] >= RegLimit[RCId])
	      --PDiff;
	  }
	  return PDiff;
	}

	/// Update RegPressure after \p SU has been scheduled. No-op when pressure
	/// tracking is disabled or \p SU has no SDNode.
	///
	/// First, for each data predecessor that still has NumRegDefsLeft, one of
	/// its register defs is charged to the pressure of its register class. Then
	/// the cost of \p SU's own register defs (past the first NumRegDefsLeft of
	/// them) is released again, clamping at zero.
	void RegReductionPQBase::scheduledNode(SUnit *SU) {
	if (!TracksRegPressure)
	return;

	if (!SU->getNode())
	return;

	for (const SDep &Pred : SU->Preds) {
	if (Pred.isCtrl())
	continue;
	SUnit *PredSU = Pred.getSUnit();
	// NumRegDefsLeft is zero when enough uses of this node have been scheduled
	// to cover the number of registers defined (they are all live).
	if (PredSU->NumRegDefsLeft == 0) {
	continue;
	}
	// FIXME: The ScheduleDAG currently loses information about which of a
	// node's values is consumed by each dependence. Consequently, if the node
	// defines multiple register classes, we don't know which to pressurize
	// here. Instead the following loop consumes the register defs in an
	// arbitrary order. At least it handles the common case of clustered loads
	// to the same class. For precise liveness, each SDep needs to indicate the
	// result number. But that tightly couples the ScheduleDAG with the
	// SelectionDAG making updates tricky. A simpler hack would be to attach a
	// value type or register class to SDep.
	//
	// The most important aspect of register tracking is balancing the increase
	// here with the reduction further below. Note that this SU may use multiple
	// defs in PredSU. That can't be determined here, but we've already
	// compensated by reducing NumRegDefsLeft in PredSU during
	// ScheduleDAGSDNodes::AddSchedEdges.
	--PredSU->NumRegDefsLeft;
	// Skip as many defs as are still left, then charge the next one only.
	unsigned SkipRegDefs = PredSU->NumRegDefsLeft;
	for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
	RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
	if (SkipRegDefs)
	continue;

	unsigned RCId, Cost;
	GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
	RegPressure[RCId] += Cost;
	break;
	}
	}

	// We should have this assert, but there may be dead SDNodes that never
	// materialize as SUnits, so they don't appear to generate liveness.
	//assert(SU->NumRegDefsLeft == 0 && "not all regdefs have scheduled uses");
	// Signed counter: it keeps decrementing past zero once skipping is done.
	int SkipRegDefs = (int)SU->NumRegDefsLeft;
	for (ScheduleDAGSDNodes::RegDefIter RegDefPos(SU, scheduleDAG);
	RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
	if (SkipRegDefs > 0)
	continue;
	unsigned RCId, Cost;
	GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
	if (RegPressure[RCId] < Cost) {
	// Register pressure tracking is imprecise. This can happen. But we try
	// hard not to let it happen because it likely results in poor scheduling.
	DEBUG(dbgs() << " SU(" << SU->NodeNum << ") has too many regdefs\n");
	RegPressure[RCId] = 0;
	}
	else {
	RegPressure[RCId] -= Cost;
	}
	}
	DEBUG(dumpRegPressure());
	}

	/// Adjust RegPressure when \p SU is unscheduled. No-op when pressure
	/// tracking is disabled, when \p SU has no SDNode, for non-machine nodes
	/// other than CopyToReg, and for the subreg / REG_SEQUENCE / IMPLICIT_DEF
	/// pseudo opcodes.
	///
	/// Only data predecessors whose successors are all unscheduled again
	/// (NumSuccsLeft == Succs.size()) are considered. Finally, pressure is added
	/// for the used values of \p SU beyond its explicit defs.
	void RegReductionPQBase::unscheduledNode(SUnit *SU) {
	  if (!TracksRegPressure)
	    return;

	  const SDNode *N = SU->getNode();
	  if (!N) return;

	  if (!N->isMachineOpcode()) {
	    if (N->getOpcode() != ISD::CopyToReg)
	      return;
	  } else {
	    unsigned Opc = N->getMachineOpcode();
	    if (Opc == TargetOpcode::EXTRACT_SUBREG ||
	        Opc == TargetOpcode::INSERT_SUBREG ||
	        Opc == TargetOpcode::SUBREG_TO_REG ||
	        Opc == TargetOpcode::REG_SEQUENCE ||
	        Opc == TargetOpcode::IMPLICIT_DEF)
	      return;
	  }

	  for (const SDep &Pred : SU->Preds) {
	    if (Pred.isCtrl())
	      continue;
	    SUnit *PredSU = Pred.getSUnit();
	    // NumSuccsLeft counts all deps. Don't compare it with NumSuccs which only
	    // counts data deps.
	    if (PredSU->NumSuccsLeft != PredSU->Succs.size())
	      continue;
	    const SDNode *PN = PredSU->getNode();
	    if (!PN->isMachineOpcode()) {
	      if (PN->getOpcode() == ISD::CopyFromReg) {
	        MVT VT = PN->getSimpleValueType(0);
	        unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
	        RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
	      }
	      continue;
	    }
	    unsigned POpc = PN->getMachineOpcode();
	    if (POpc == TargetOpcode::IMPLICIT_DEF)
	      continue;
	    if (POpc == TargetOpcode::EXTRACT_SUBREG ||
	        POpc == TargetOpcode::INSERT_SUBREG ||
	        POpc == TargetOpcode::SUBREG_TO_REG) {
	      MVT VT = PN->getSimpleValueType(0);
	      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
	      RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
	      continue;
	    }
	    unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs();
	    for (unsigned i = 0; i != NumDefs; ++i) {
	      MVT VT = PN->getSimpleValueType(i);
	      if (!PN->hasAnyUseOfValue(i))
	        continue;
	      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
	      if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT))
	        // Register pressure tracking is imprecise. This can happen.
	        RegPressure[RCId] = 0;
	      else
	        RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
	    }
	  }

	  // Check for isMachineOpcode() as PrescheduleNodesWithMultipleUses()
	  // may transfer data dependencies to CopyToReg.
	  if (SU->NumSuccs && N->isMachineOpcode()) {
	    unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
	    // Starts at NumDefs: only values past the explicit defs are visited.
	    for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
	      MVT VT = N->getSimpleValueType(i);
	      if (VT == MVT::Glue || VT == MVT::Other)
	        continue;
	      if (!N->hasAnyUseOfValue(i))
	        continue;
	      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
	      RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
	    }
	  }

	  DEBUG(dumpRegPressure());
	}

	//===----------------------------------------------------------------------===//
	// Dynamic Node Priority for Register Pressure Reduction
	//===----------------------------------------------------------------------===//

	/// closestSucc - Return the largest height found among the data successors
	/// of \p SU (0 if there are none). A CopyToReg successor does not contribute
	/// its own height; it contributes the closestSucc() of that CopyToReg plus
	/// one, so that stacked CopyToRegs are treated as being at the same
	/// position.
	static unsigned closestSucc(const SUnit *SU) {
	  unsigned Closest = 0;
	  for (const SDep &Edge : SU->Succs) {
	    if (Edge.isCtrl())
	      continue; // ignore chain succs
	    SUnit *User = Edge.getSUnit();
	    unsigned UserHeight = User->getHeight();
	    const SDNode *UserNode = User->getNode();
	    if (UserNode && UserNode->getOpcode() == ISD::CopyToReg)
	      UserHeight = closestSucc(User) + 1;
	    if (Closest < UserHeight)
	      Closest = UserHeight;
	  }
	  return Closest;
	}

	/// calcMaxScratches - Estimate the worst-case number of scratch registers
	/// \p SU needs, taken to be the number of its data (non-chain)
	/// predecessors.
	static unsigned calcMaxScratches(const SUnit *SU) {
	  unsigned DataPreds = 0;
	  for (const SDep &Dep : SU->Preds) {
	    if (!Dep.isCtrl())
	      ++DataPreds;
	  }
	  return DataPreds;
	}

	/// hasOnlyLiveInOpers - Return true if \p SU has at least one data
	/// predecessor and every data predecessor is a CopyFromReg from a virtual
	/// register. Chain predecessors are ignored.
	static bool hasOnlyLiveInOpers(const SUnit *SU) {
	  bool SawLiveIn = false;
	  for (const SDep &Dep : SU->Preds) {
	    if (Dep.isCtrl())
	      continue;
	    const SUnit *Def = Dep.getSUnit();
	    const bool IsCopyFromReg =
	        Def->getNode() && Def->getNode()->getOpcode() == ISD::CopyFromReg;
	    if (!IsCopyFromReg)
	      return false;
	    unsigned Reg =
	        cast<RegisterSDNode>(Def->getNode()->getOperand(1))->getReg();
	    if (!TargetRegisterInfo::isVirtualRegister(Reg))
	      return false;
	    SawLiveIn = true;
	  }
	  return SawLiveIn;
	}

	/// hasOnlyLiveOutUses - Return true if \p SU has at least one data successor
	/// and every data successor is a CopyToReg to a virtual register. Such a def
	/// is probably a liveout with no other use and should be scheduled closer
	/// to the terminator. Chain successors are ignored.
	static bool hasOnlyLiveOutUses(const SUnit *SU) {
	  bool SawLiveOut = false;
	  for (const SDep &Edge : SU->Succs) {
	    if (Edge.isCtrl())
	      continue;
	    const SUnit *User = Edge.getSUnit();
	    const bool IsCopyToReg =
	        User->getNode() && User->getNode()->getOpcode() == ISD::CopyToReg;
	    if (!IsCopyToReg)
	      return false;
	    unsigned Reg =
	        cast<RegisterSDNode>(User->getNode()->getOperand(1))->getReg();
	    if (!TargetRegisterInfo::isVirtualRegister(Reg))
	      return false;
	    SawLiveOut = true;
	  }
	  return SawLiveOut;
	}

	// Set isVRegCycle for a node with only live in opers and live out uses. Also
	// set isVRegCycle for its CopyFromReg operands.
	//
	// This is only relevant for single-block loops, in which case the VRegCycle
	// node is likely an induction variable in which the operand and target virtual
	// registers should be coalesced (e.g. pre/post increment values). Setting the
	// isVRegCycle flag helps the scheduler prioritize other uses of the same
	// CopyFromReg so that this node becomes the virtual register "kill". This
	// avoids interference between the values live in and out of the block and
	// eliminates a copy inside the loop.
	//
	// Does nothing when DisableSchedVRegCycle is set.
	static void initVRegCycle(SUnit *SU) {
	  if (DisableSchedVRegCycle)
	    return;

	  if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU))
	    return;

	  DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");

	  SU->isVRegCycle = true;

	  // Every data predecessor is a CopyFromReg here (checked above).
	  for (const SDep &Pred : SU->Preds) {
	    if (Pred.isCtrl()) continue;
	    Pred.getSUnit()->isVRegCycle = true;
	  }
	}

	// Once the definition of a VRegCycle has been scheduled, other uses of the
	// same VReg should no longer be penalized: clear isVRegCycle on every data
	// predecessor of SU that carries it. No-op unless SU itself is flagged.
	static void resetVRegCycle(SUnit *SU) {
	  if (!SU->isVRegCycle)
	    return;

	  for (const SDep &Dep : SU->Preds) {
	    if (Dep.isCtrl())
	      continue; // ignore chain preds
	    SUnit *Def = Dep.getSUnit();
	    if (!Def->isVRegCycle)
	      continue;
	    assert(Def->getNode()->getOpcode() == ISD::CopyFromReg &&
	           "VRegCycle def must be CopyFromReg");
	    Def->isVRegCycle = false;
	  }
	}

	// Return true if SU has a data predecessor that is a CopyFromReg flagged as
	// a VRegCycle, i.e. a node that defines the VRegCycle has not been scheduled
	// yet. A unit that is itself flagged isVRegCycle never counts as a "use".
	static bool hasVRegCycleUse(const SUnit *SU) {
	  // If this SU also defines the VReg, don't hoist it as a "use".
	  if (SU->isVRegCycle)
	    return false;

	  for (const SDep &Dep : SU->Preds) {
	    if (Dep.isCtrl())
	      continue; // ignore chain preds
	    const bool IsCycleCopy =
	        Dep.getSUnit()->isVRegCycle &&
	        Dep.getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg;
	    if (!IsCycleCopy)
	      continue;
	    DEBUG(dbgs() << " VReg cycle use: SU (" << SU->NodeNum << ")\n");
	    return true;
	  }
	  return false;
	}

	// Check for either a dependence (latency) or resource (hazard) stall.
	//
	// Note: The ScheduleHazardRecognizer interface requires a non-const SU.
	static bool BUHasStall(SUnit SU, int Height, RegReductionPQBase SPQ) {
	if ((int)SPQ->getCurCycle() < Height) return true;
	if (SPQ->getHazardRec()->getHazardType(SU, 0)
	!= ScheduleHazardRecognizer::NoHazard)
	return true;
	return false;
	}

	// Return -1 if left has higher priority, 1 if right has higher priority.
	// Return 0 if latency-based priority is equivalent.
	static int BUCompareLatency(SUnit left, SUnit right, bool checkPref,
	RegReductionPQBase *SPQ) {
	// Scheduling an instruction that uses a VReg whose postincrement has not yet
	// been scheduled will induce a copy. Model this as an extra cycle of latency.
	int LPenalty = hasVRegCycleUse(left) ? 1 : 0;
	int RPenalty = hasVRegCycleUse(right) ? 1 : 0;
	int LHeight = (int)left->getHeight() + LPenalty;
	int RHeight = (int)right->getHeight() + RPenalty;

	bool LStall = (!checkPref \|\| left->SchedulingPref == Sched::ILP) &&
	BUHasStall(left, LHeight, SPQ);
	bool RStall = (!checkPref \|\| right->SchedulingPref == Sched::ILP) &&
	BUHasStall(right, RHeight, SPQ);

	// If scheduling one of the node will cause a pipeline stall, delay it.
	// If scheduling either one of the node will cause a pipeline stall, sort
	// them according to their height.
	if (LStall) {
	if (!RStall)
	return 1;
	if (LHeight != RHeight)
	return LHeight > RHeight ? 1 : -1;
	} else if (RStall)
	return -1;

	// If either node is scheduling for latency, sort them by height/depth
	// and latency.
	if (!checkPref \|\| (left->SchedulingPref == Sched::ILP \|\|
	right->SchedulingPref == Sched::ILP)) {
	// If neither instruction stalls (!LStall && !RStall) and HazardRecognizer
	// is enabled, grouping instructions by cycle, then its height is already
	// covered so only its depth matters. We also reach this point if both stall
	// but have the same height.
	if (!SPQ->getHazardRec()->isEnabled()) {
	if (LHeight != RHeight)
	return LHeight > RHeight ? 1 : -1;
	}
	int LDepth = left->getDepth() - LPenalty;
	int RDepth = right->getDepth() - RPenalty;
	if (LDepth != RDepth) {
	DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
	<< ") depth " << LDepth << " vs SU (" << right->NodeNum
	<< ") depth " << RDepth << "\n");
	return LDepth < RDepth ? 1 : -1;
	}
	if (left->Latency != right->Latency)
	return left->Latency > right->Latency ? 1 : -1;
	}
	return 0;
	}

	static bool BURRSort(SUnit left, SUnit right, RegReductionPQBase *SPQ) {
	// Schedule physical register definitions close to their use. This is
	// motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
	// long as shortening physreg live ranges is generally good, we can defer
	// creating a subtarget hook.
	if (!DisableSchedPhysRegJoin) {
	bool LHasPhysReg = left->hasPhysRegDefs;
	bool RHasPhysReg = right->hasPhysRegDefs;
	if (LHasPhysReg != RHasPhysReg) {
	#ifndef NDEBUG
	static const char *const PhysRegMsg[] = { " has no physreg",
	" defines a physreg" };
	#endif
	DEBUG(dbgs() << " SU (" << left->NodeNum << ") "
	<< PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
	<< PhysRegMsg[RHasPhysReg] << "\n");
	return LHasPhysReg < RHasPhysReg;
	}
	}

	// Prioritize by Sethi-Ulmann number and push CopyToReg nodes down.
	unsigned LPriority = SPQ->getNodePriority(left);
	unsigned RPriority = SPQ->getNodePriority(right);

	// Be really careful about hoisting call operands above previous calls.
	// Only allows it if it would reduce register pressure.
	if (left->isCall && right->isCallOp) {
	unsigned RNumVals = right->getNode()->getNumValues();
	RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
	}
	if (right->isCall && left->isCallOp) {
	unsigned LNumVals = left->getNode()->getNumValues();
	LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
	}

	if (LPriority != RPriority)
	return LPriority > RPriority;

	// One or both of the nodes are calls and their sethi-ullman numbers are the
	// same, then keep source order.
	if (left->isCall \|\| right->isCall) {
	unsigned LOrder = SPQ->getNodeOrdering(left);
	unsigned ROrder = SPQ->getNodeOrdering(right);

	// Prefer an ordering where the lower the non-zero order number, the higher
	// the preference.
	if ((LOrder \|\| ROrder) && LOrder != ROrder)
	return LOrder != 0 && (LOrder < ROrder \|\| ROrder == 0);
	}

	// Try schedule def + use closer when Sethi-Ullman numbers are the same.
	// e.g.
	// t1 = op t2, c1
	// t3 = op t4, c2
	//
	// and the following instructions are both ready.
	// t2 = op c3
	// t4 = op c4
	//
	// Then schedule t2 = op first.
	// i.e.
	// t4 = op c4
	// t2 = op c3
	// t1 = op t2, c1
	// t3 = op t4, c2
	//
	// This creates more short live intervals.
	unsigned LDist = closestSucc(left);
	unsigned RDist = closestSucc(right);
	if (LDist != RDist)
	return LDist < RDist;

	// How many registers becomes live when the node is scheduled.
	unsigned LScratch = calcMaxScratches(left);
	unsigned RScratch = calcMaxScratches(right);
	if (LScratch != RScratch)
	return LScratch > RScratch;

	// Comparing latency against a call makes little sense unless the node
	// is register pressure-neutral.
	if ((left->isCall && RPriority > 0) \|\| (right->isCall && LPriority > 0))
	return (left->NodeQueueId > right->NodeQueueId);

	// Do not compare latencies when one or both of the nodes are calls.
	if (!DisableSchedCycles &&
	!(left->isCall \|\| right->isCall)) {
	int result = BUCompareLatency(left, right, false /checkPref/, SPQ);
	if (result != 0)
	return result > 0;
	}
	else {
	if (left->getHeight() != right->getHeight())
	return left->getHeight() > right->getHeight();

	if (left->getDepth() != right->getDepth())
	return left->getDepth() < right->getDepth();
	}

	assert(left->NodeQueueId && right->NodeQueueId &&
	"NodeQueueId cannot be zero");
	return (left->NodeQueueId > right->NodeQueueId);
	}

	// Bottom up
	bool bu_ls_rr_sort::operator()(SUnit left, SUnit right) const {
	if (int res = checkSpecialNodes(left, right))
	return res > 0;

	return BURRSort(left, right, SPQ);
	}

	// Source order, otherwise bottom up.
	bool src_ls_rr_sort::operator()(SUnit left, SUnit right) const {
	if (int res = checkSpecialNodes(left, right))
	return res > 0;

	unsigned LOrder = SPQ->getNodeOrdering(left);
	unsigned ROrder = SPQ->getNodeOrdering(right);

	// Prefer an ordering where the lower the non-zero order number, the higher
	// the preference.
	if ((LOrder \|\| ROrder) && LOrder != ROrder)
	return LOrder != 0 && (LOrder < ROrder \|\| ROrder == 0);

	return BURRSort(left, right, SPQ);
	}

	// If the time between now and when the instruction will be ready can cover
	// the spill code, then avoid adding it to the ready queue. This gives long
	// stalls highest priority and allows hoisting across calls. It should also
	// speed up processing the available queue.
	//
	// A node that may reduce register pressure is always ready. Otherwise it is
	// ready only if its height is within ReadyDelay cycles of CurCycle and the
	// hazard recognizer reports no hazard at a stall offset of -ReadyDelay.
	bool hybrid_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
	  static const unsigned ReadyDelay = 3;

	  if (SPQ->MayReduceRegPressure(SU))
	    return true;

	  if (SU->getHeight() > (CurCycle + ReadyDelay))
	    return false;

	  return SPQ->getHazardRec()->getHazardType(SU, -ReadyDelay) ==
	         ScheduleHazardRecognizer::NoHazard;
	}

	// Return true if right should be scheduled with higher priority than left.
	bool hybrid_ls_rr_sort::operator()(SUnit left, SUnit right) const {
	if (int res = checkSpecialNodes(left, right))
	return res > 0;

	if (left->isCall \|\| right->isCall)
	// No way to compute latency of calls.
	return BURRSort(left, right, SPQ);

	bool LHigh = SPQ->HighRegPressure(left);
	bool RHigh = SPQ->HighRegPressure(right);
	// Avoid causing spills. If register pressure is high, schedule for
	// register pressure reduction.
	if (LHigh && !RHigh) {
	DEBUG(dbgs() << " pressure SU(" << left->NodeNum << ") > SU("
	<< right->NodeNum << ")\n");
	return true;
	}
	else if (!LHigh && RHigh) {
	DEBUG(dbgs() << " pressure SU(" << right->NodeNum << ") > SU("
	<< left->NodeNum << ")\n");
	return false;
	}
	if (!LHigh && !RHigh) {
	int result = BUCompareLatency(left, right, true /checkPref/, SPQ);
	if (result != 0)
	return result > 0;
	}
	return BURRSort(left, right, SPQ);
	}

	// Schedule as many instructions in each cycle as possible. So don't make an
	// instruction available unless it is ready in the current cycle: its height
	// must not exceed CurCycle and the hazard recognizer must report no hazard.
	bool ilp_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
	  if (SU->getHeight() > CurCycle)
	    return false;

	  return SPQ->getHazardRec()->getHazardType(SU, 0) ==
	         ScheduleHazardRecognizer::NoHazard;
	}

	/// Return true for nodes that should stay close to their uses:
	/// TokenFactor / CopyToReg, the subreg pseudo-ops, and nodes with successors
	/// but no predecessors. These are the same cases for which
	/// RegReductionPQBase::getNodePriority returns 0.
	static bool canEnableCoalescing(SUnit *SU) {
	  unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
	  if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
	    // CopyToReg should be close to its uses to facilitate coalescing and
	    // avoid spilling.
	    return true;

	  if (Opc == TargetOpcode::EXTRACT_SUBREG ||
	      Opc == TargetOpcode::SUBREG_TO_REG ||
	      Opc == TargetOpcode::INSERT_SUBREG)
	    // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
	    // close to their uses to facilitate coalescing.
	    return true;

	  if (SU->NumPreds == 0 && SU->NumSuccs != 0)
	    // If SU does not have a register def, schedule it close to its uses
	    // because it does not lengthen any live ranges.
	    return true;

	  return false;
	}

	// list-ilp is currently an experimental scheduler that allows various
	// heuristics to be enabled prior to the normal register reduction logic.
	bool ilp_ls_rr_sort::operator()(SUnit left, SUnit right) const {
	if (int res = checkSpecialNodes(left, right))
	return res > 0;

	if (left->isCall \|\| right->isCall)
	// No way to compute latency of calls.
	return BURRSort(left, right, SPQ);

	unsigned LLiveUses = 0, RLiveUses = 0;
	int LPDiff = 0, RPDiff = 0;
	if (!DisableSchedRegPressure \|\| !DisableSchedLiveUses) {
	LPDiff = SPQ->RegPressureDiff(left, LLiveUses);
	RPDiff = SPQ->RegPressureDiff(right, RLiveUses);
	}
	if (!DisableSchedRegPressure && LPDiff != RPDiff) {
	DEBUG(dbgs() << "RegPressureDiff SU(" << left->NodeNum << "): " << LPDiff
	<< " != SU(" << right->NodeNum << "): " << RPDiff << "\n");
	return LPDiff > RPDiff;
	}

	if (!DisableSchedRegPressure && (LPDiff > 0 \|\| RPDiff > 0)) {
	bool LReduce = canEnableCoalescing(left);
	bool RReduce = canEnableCoalescing(right);
	if (LReduce && !RReduce) return false;
	if (RReduce && !LReduce) return true;
	}

	if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) {
	DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses
	<< " != SU(" << right->NodeNum << "): " << RLiveUses << "\n");
	return LLiveUses < RLiveUses;
	}

	if (!DisableSchedStalls) {
	bool LStall = BUHasStall(left, left->getHeight(), SPQ);
	bool RStall = BUHasStall(right, right->getHeight(), SPQ);
	if (LStall != RStall)
	return left->getHeight() > right->getHeight();
	}

	if (!DisableSchedCriticalPath) {
	int spread = (int)left->getDepth() - (int)right->getDepth();
	if (std::abs(spread) > MaxReorderWindow) {
	DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
	<< left->getDepth() << " != SU(" << right->NodeNum << "): "
	<< right->getDepth() << "\n");
	return left->getDepth() < right->getDepth();
	}
	}

	if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
	int spread = (int)left->getHeight() - (int)right->getHeight();
	if (std::abs(spread) > MaxReorderWindow)
	return left->getHeight() > right->getHeight();
	}

	return BURRSort(left, right, SPQ);
	}

	void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
	SUnits = &sunits;
	// Add pseudo dependency edges for two-address nodes.
	if (!Disable2AddrHack)
	AddPseudoTwoAddrDeps();
	// Reroute edges to nodes with multiple uses.
	if (!TracksRegPressure && !SrcOrder)
	PrescheduleNodesWithMultipleUses();
	// Calculate node priorities.
	CalculateSethiUllmanNumbers();

	// For single block loops, mark nodes that look like canonical IV increments.
	if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB))
	for (SUnit &SU : sunits)
	initVRegCycle(&SU);
	}

	//===----------------------------------------------------------------------===//
	// Preschedule for Register Pressure
	//===----------------------------------------------------------------------===//

	bool RegReductionPQBase::canClobber(const SUnit SU, const SUnit Op) {
	if (SU->isTwoAddress) {
	unsigned Opc = SU->getNode()->getMachineOpcode();
	const MCInstrDesc &MCID = TII->get(Opc);
	unsigned NumRes = MCID.getNumDefs();
	unsigned NumOps = MCID.getNumOperands() - NumRes;
	for (unsigned i = 0; i != NumOps; ++i) {
	if (MCID.getOperandConstraint(i+NumRes, MCOI::TIED_TO) != -1) {
	SDNode *DU = SU->getNode()->getOperand(i).getNode();
	if (DU->getNodeId() != -1 &&
	Op->OrigNode == &(*SUnits)[DU->getNodeId()])
	return true;
	}
	}
	}
	return false;
	}

	/// canClobberReachingPhysRegUse - True if SU would clobber one of it's
	/// successor's explicit physregs whose definition can reach DepSU.
	/// i.e. DepSU should not be scheduled above SU.
	static bool canClobberReachingPhysRegUse(const SUnit DepSU, const SUnit SU,
	ScheduleDAGRRList *scheduleDAG,
	const TargetInstrInfo *TII,
	const TargetRegisterInfo *TRI) {
	const MCPhysReg *ImpDefs
	= TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs();
	const uint32_t *RegMask = getNodeRegMask(SU->getNode());
	if(!ImpDefs && !RegMask)
	return false;

	for (const SDep &Succ : SU->Succs) {
	SUnit *SuccSU = Succ.getSUnit();
	for (const SDep &SuccPred : SuccSU->Preds) {
	if (!SuccPred.isAssignedRegDep())
	continue;

	if (RegMask &&
	MachineOperand::clobbersPhysReg(RegMask, SuccPred.getReg()) &&
	scheduleDAG->IsReachable(DepSU, SuccPred.getSUnit()))
	return true;

	if (ImpDefs)
	for (const MCPhysReg ImpDef = ImpDefs; ImpDef; ++ImpDef)
	// Return true if SU clobbers this physical register use and the
	// definition of the register reaches from DepSU. IsReachable queries
	// a topological forward sort of the DAG (following the successors).
	if (TRI->regsOverlap(*ImpDef, SuccPred.getReg()) &&
	scheduleDAG->IsReachable(DepSU, SuccPred.getSUnit()))
	return true;
	}
	}
	return false;
	}

	/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
	/// physical register defs.
	static bool canClobberPhysRegDefs(const SUnit SuccSU, const SUnit SU,
	const TargetInstrInfo *TII,
	const TargetRegisterInfo *TRI) {
	SDNode *N = SuccSU->getNode();
	unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
	const MCPhysReg *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs();
	assert(ImpDefs && "Caller should check hasPhysRegDefs");
	for (const SDNode *SUNode = SU->getNode(); SUNode;
	SUNode = SUNode->getGluedNode()) {
	if (!SUNode->isMachineOpcode())
	continue;
	const MCPhysReg *SUImpDefs =
	TII->get(SUNode->getMachineOpcode()).getImplicitDefs();
	const uint32_t *SURegMask = getNodeRegMask(SUNode);
	if (!SUImpDefs && !SURegMask)
	continue;
	for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
	MVT VT = N->getSimpleValueType(i);
	if (VT == MVT::Glue \|\| VT == MVT::Other)
	continue;
	if (!N->hasAnyUseOfValue(i))
	continue;
	unsigned Reg = ImpDefs[i - NumDefs];
	if (SURegMask && MachineOperand::clobbersPhysReg(SURegMask, Reg))
	return true;
	if (!SUImpDefs)
	continue;
	for (;*SUImpDefs; ++SUImpDefs) {
	unsigned SUReg = *SUImpDefs;
	if (TRI->regsOverlap(Reg, SUReg))
	return true;
	}
	}
	}
	return false;
	}

	/// PrescheduleNodesWithMultipleUses - Nodes with multiple uses
	/// are not handled well by the general register pressure reduction
	/// heuristics. When presented with code like this:
	///
	///      N
	///    / |
	///   /  |
	///  U  store
	///  |
	/// ...
	///
	/// the heuristics tend to push the store up, but since the
	/// operand of the store has another use (U), this would increase
	/// the length of that other use (the U->N edge).
	///
	/// This function transforms code like the above to route U's
	/// dependence through the store when possible, like this:
	///
	///      N
	///      ||
	///      ||
	///     store
	///       |
	///       U
	///       |
	///      ...
	///
	/// This results in the store being scheduled immediately
	/// after N, which shortens the U->N live range, reducing
	/// register pressure.
	void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
	// Visit all the nodes in topological order, working top-down.
	for (SUnit &SU : *SUnits) {
	// For now, only look at nodes with no data successors, such as stores.
	// These are especially important, due to the heuristics in
	// getNodePriority for nodes with no data successors.
	if (SU.NumSuccs != 0)
	continue;
	// For now, only look at nodes with exactly one data predecessor.
	if (SU.NumPreds != 1)
	continue;
	// Avoid prescheduling copies to virtual registers, which don't behave
	// like other nodes from the perspective of scheduling heuristics.
	if (SDNode *N = SU.getNode())
	if (N->getOpcode() == ISD::CopyToReg &&
	TargetRegisterInfo::isVirtualRegister
	(cast<RegisterSDNode>(N->getOperand(1))->getReg()))
	continue;

	// Locate the single data predecessor.
	SUnit *PredSU = nullptr;
	for (const SDep &Pred : SU.Preds)
	if (!Pred.isCtrl()) {
	PredSU = Pred.getSUnit();
	break;
	}
	// NumPreds == 1 was checked above, so a non-control predecessor is expected.
	assert(PredSU);

	// Don't rewrite edges that carry physregs, because that requires additional
	// support infrastructure.
	if (PredSU->hasPhysRegDefs)
	continue;
	// Short-circuit the case where SU is PredSU's only data successor.
	if (PredSU->NumSuccs == 1)
	continue;
	// Avoid prescheduling to copies from virtual registers, which don't behave
	// like other nodes from the perspective of scheduling heuristics.
	// NOTE(review): the comment speaks of prescheduling *to* a copy, yet the
	// check inspects SU's node rather than PredSU's -- confirm this is intended.
	if (SDNode *N = SU.getNode())
	if (N->getOpcode() == ISD::CopyFromReg &&
	TargetRegisterInfo::isVirtualRegister
	(cast<RegisterSDNode>(N->getOperand(1))->getReg()))
	continue;

	// Perform checks on the successors of PredSU.
	for (const SDep &PredSucc : PredSU->Succs) {
	SUnit *PredSuccSU = PredSucc.getSUnit();
	if (PredSuccSU == &SU) continue;
	// If PredSU has another successor with no data successors, for
	// now don't attempt to choose either over the other.
	if (PredSuccSU->NumSuccs == 0)
	goto outer_loop_continue;
	// Don't break physical register dependencies.
	if (SU.hasPhysRegClobbers && PredSuccSU->hasPhysRegDefs)
	if (canClobberPhysRegDefs(PredSuccSU, &SU, TII, TRI))
	goto outer_loop_continue;
	// Don't introduce graph cycles.
	if (scheduleDAG->IsReachable(&SU, PredSuccSU))
	goto outer_loop_continue;
	}

	// Ok, the transformation is safe and the heuristics suggest it is
	// profitable. Update the graph.
	DEBUG(dbgs() << " Prescheduling SU #" << SU.NodeNum
	<< " next to PredSU #" << PredSU->NodeNum
	<< " to guide scheduling in the presence of multiple uses\n");
	// Succs.size() is re-read on every iteration and the index is stepped back
	// after each rewrite, so the slot at i is visited again.
	// NOTE(review): this presumably relies on RemovePred() erasing the edge
	// from PredSU->Succs -- confirm against ScheduleDAGRRList::RemovePred.
	for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
	// Work on a copy of the edge; it is retargeted twice below.
	SDep Edge = PredSU->Succs[i];
	assert(!Edge.isAssignedRegDep());
	SUnit *SuccSU = Edge.getSUnit();
	if (SuccSU != &SU) {
	// Replace PredSU -> SuccSU with PredSU -> SU -> SuccSU.
	Edge.setSUnit(PredSU);
	scheduleDAG->RemovePred(SuccSU, Edge);
	scheduleDAG->AddPred(&SU, Edge);
	Edge.setSUnit(&SU);
	scheduleDAG->AddPred(SuccSU, Edge);
	--i;
	}
	}
	// Target of the gotos above: abandon this SU and move on to the next one.
	outer_loop_continue:;
	}
	}

	/// AddPseudoTwoAddrDeps - If two nodes share an operand and one of them uses
	/// it as a def&use operand. Add a pseudo control edge from it to the other
	/// node (if it won't create a cycle) so the two-address one will be scheduled
	/// first (lower in the schedule). If both nodes are two-address, favor the
	/// one that has a CopyToReg use (more likely to be a loop induction update).
	/// If both are two-address, but one is commutable while the other is not
	/// commutable, favor the one that's not commutable.
	void RegReductionPQBase::AddPseudoTwoAddrDeps() {
	  for (SUnit &SU : *SUnits) {
	    if (!SU.isTwoAddress)
	      continue;

	    // Only plain machine nodes without glued nodes are considered.
	    SDNode *Node = SU.getNode();
	    if (!Node || !Node->isMachineOpcode() || SU.getNode()->getGluedNode())
	      continue;

	    bool isLiveOut = hasOnlyLiveOutUses(&SU);
	    unsigned Opc = Node->getMachineOpcode();
	    const MCInstrDesc &MCID = TII->get(Opc);
	    unsigned NumRes = MCID.getNumDefs();
	    unsigned NumOps = MCID.getNumOperands() - NumRes;
	    for (unsigned j = 0; j != NumOps; ++j) {
	      // Only operands tied to a def are of interest; MCInstrDesc operand
	      // indices count the defs first, hence j+NumRes.
	      if (MCID.getOperandConstraint(j+NumRes, MCOI::TIED_TO) == -1)
	        continue;
	      SDNode *DU = SU.getNode()->getOperand(j).getNode();
	      if (DU->getNodeId() == -1)
	        continue;
	      const SUnit *DUSU = &(*SUnits)[DU->getNodeId()];
	      if (!DUSU)
	        continue;
	      for (const SDep &Succ : DUSU->Succs) {
	        if (Succ.isCtrl())
	          continue;
	        SUnit *SuccSU = Succ.getSUnit();
	        if (SuccSU == &SU)
	          continue;
	        // Be conservative. Ignore if nodes aren't at roughly the same
	        // depth and height.
	        if (SuccSU->getHeight() < SU.getHeight() &&
	            (SU.getHeight() - SuccSU->getHeight()) > 1)
	          continue;
	        // Skip past COPY_TO_REGCLASS nodes, so that the pseudo edge
	        // constrains whatever is using the copy, instead of the copy
	        // itself. In the case that the copy is coalesced, this
	        // preserves the intent of the pseudo two-address heuristics.
	        while (SuccSU->Succs.size() == 1 &&
	               SuccSU->getNode()->isMachineOpcode() &&
	               SuccSU->getNode()->getMachineOpcode() ==
	                 TargetOpcode::COPY_TO_REGCLASS)
	          SuccSU = SuccSU->Succs.front().getSUnit();
	        // Don't constrain non-instruction nodes.
	        if (!SuccSU->getNode() || !SuccSU->getNode()->isMachineOpcode())
	          continue;
	        // Don't constrain nodes with physical register defs if the
	        // predecessor can clobber them.
	        if (SuccSU->hasPhysRegDefs && SU.hasPhysRegClobbers) {
	          if (canClobberPhysRegDefs(SuccSU, &SU, TII, TRI))
	            continue;
	        }
	        // Don't constrain EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG;
	        // these may be coalesced away. We want them close to their uses.
	        unsigned SuccOpc = SuccSU->getNode()->getMachineOpcode();
	        if (SuccOpc == TargetOpcode::EXTRACT_SUBREG ||
	            SuccOpc == TargetOpcode::INSERT_SUBREG ||
	            SuccOpc == TargetOpcode::SUBREG_TO_REG)
	          continue;
	        // Add the edge only if it is safe with respect to physreg uses,
	        // at least one preference rule favors SU, and no cycle results.
	        if (!canClobberReachingPhysRegUse(SuccSU, &SU, scheduleDAG, TII, TRI) &&
	            (!canClobber(SuccSU, DUSU) ||
	             (isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
	             (!SU.isCommutable && SuccSU->isCommutable)) &&
	            !scheduleDAG->IsReachable(SuccSU, &SU)) {
	          DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
	                       << SU.NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
	          scheduleDAG->AddPred(&SU, SDep(SuccSU, SDep::Artificial));
	        }
	      }
	    }
	  }
	}

	//===----------------------------------------------------------------------===//
	// Public Constructor Functions
	//===----------------------------------------------------------------------===//

	/// Build a ScheduleDAGRRList driven by a BURegReductionPriorityQueue.
	///
	/// The queue is created with both boolean options false and no
	/// TargetLowering; the scheduler is created with its boolean flag false.
	/// The queue is then pointed back at the scheduler, which is returned.
	ScheduleDAGSDNodes *
	llvm::createBURRListDAGScheduler(SelectionDAGISel *IS,
	                                 CodeGenOpt::Level OptLevel) {
	  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
	  const TargetInstrInfo *TII = STI.getInstrInfo();
	  const TargetRegisterInfo *TRI = STI.getRegisterInfo();

	  BURegReductionPriorityQueue *PQ =
	    new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, nullptr);
	  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
	  PQ->setScheduleDAG(SD);
	  return SD;
	}

	/// Build a ScheduleDAGRRList driven by a SrcRegReductionPriorityQueue.
	///
	/// The queue is created with its first boolean option false, its second true,
	/// and no TargetLowering; the scheduler is created with its boolean flag
	/// false. The queue is then pointed back at the scheduler, which is returned.
	ScheduleDAGSDNodes *
	llvm::createSourceListDAGScheduler(SelectionDAGISel *IS,
	                                   CodeGenOpt::Level OptLevel) {
	  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
	  const TargetInstrInfo *TII = STI.getInstrInfo();
	  const TargetRegisterInfo *TRI = STI.getRegisterInfo();

	  SrcRegReductionPriorityQueue *PQ =
	    new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, nullptr);
	  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
	  PQ->setScheduleDAG(SD);
	  return SD;
	}

	/// Build a ScheduleDAGRRList driven by a HybridBURRPriorityQueue.
	///
	/// The queue is created with its first boolean option true, its second false,
	/// and the selector's TargetLowering; the scheduler is created with its
	/// boolean flag true. The queue is then pointed back at the scheduler, which
	/// is returned.
	ScheduleDAGSDNodes *
	llvm::createHybridListDAGScheduler(SelectionDAGISel *IS,
	                                   CodeGenOpt::Level OptLevel) {
	  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
	  const TargetInstrInfo *TII = STI.getInstrInfo();
	  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
	  const TargetLowering *TLI = IS->TLI;

	  HybridBURRPriorityQueue *PQ =
	    new HybridBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);

	  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel);
	  PQ->setScheduleDAG(SD);
	  return SD;
	}

	/// Build a ScheduleDAGRRList driven by an ILPBURRPriorityQueue.
	///
	/// The queue is created with its first boolean option true, its second false,
	/// and the selector's TargetLowering; the scheduler is created with its
	/// boolean flag true. The queue is then pointed back at the scheduler, which
	/// is returned.
	ScheduleDAGSDNodes *
	llvm::createILPListDAGScheduler(SelectionDAGISel *IS,
	                                CodeGenOpt::Level OptLevel) {
	  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
	  const TargetInstrInfo *TII = STI.getInstrInfo();
	  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
	  const TargetLowering *TLI = IS->TLI;

	  ILPBURRPriorityQueue *PQ =
	    new ILPBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
	  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel);
	  PQ->setScheduleDAG(SD);
	  return SD;
	}
	Index: head/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp (revision 328816)
	+++ head/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp (revision 328817)
	@@ -1,1151 +1,1154 @@
	//===- TargetPassConfig.cpp - Target independent code generation passes ---===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines interfaces to access the target independent code
	// generation passes provided by the LLVM backend.
	//
	//===---------------------------------------------------------------------===//

	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/Analysis/BasicAliasAnalysis.h"
	#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
	#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
	#include "llvm/Analysis/CallGraphSCCPass.h"
	#include "llvm/Analysis/ScopedNoAliasAA.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachinePassRegistry.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/RegAllocRegistry.h"
	#include "llvm/IR/IRPrintingPasses.h"
	#include "llvm/IR/LegacyPassManager.h"
	#include "llvm/IR/Verifier.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCTargetOptions.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/Threading.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Utils/SymbolRewriter.h"
	#include <cassert>
	#include <string>

	using namespace llvm;

	// -enable-ipra. Not file-static. The TargetPassConfig constructor copies this
	// flag into TM.Options.EnableIPRA when it was given on the command line and
	// otherwise uses TM.useIPRA().
	cl::opt<bool> EnableIPRA("enable-ipra", cl::init(false), cl::Hidden,
	cl::desc("Enable interprocedural register allocation "
	"to reduce load/store at procedure calls."));
	// Hidden boolean switches for individual passes. Those that overridePass()
	// consults make it return a default-constructed IdentifyingPassPtr in place
	// of the requested pass when set.
	static cl::opt<bool> DisablePostRASched("disable-post-ra", cl::Hidden,
	cl::desc("Disable Post Regalloc Scheduler"));
	static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
	cl::desc("Disable branch folding"));
	static cl::opt<bool> DisableTailDuplicate("disable-tail-duplicate", cl::Hidden,
	cl::desc("Disable tail duplication"));
	static cl::opt<bool> DisableEarlyTailDup("disable-early-taildup", cl::Hidden,
	cl::desc("Disable pre-register allocation tail duplication"));
	static cl::opt<bool> DisableBlockPlacement("disable-block-placement",
	cl::Hidden, cl::desc("Disable probability-driven block placement"));
	static cl::opt<bool> EnableBlockPlacementStats("enable-block-placement-stats",
	cl::Hidden, cl::desc("Collect probability-driven block placement stats"));
	static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
	cl::desc("Disable Stack Slot Coloring"));
	static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden,
	cl::desc("Disable Machine Dead Code Elimination"));
	static cl::opt<bool> DisableEarlyIfConversion("disable-early-ifcvt", cl::Hidden,
	cl::desc("Disable Early If-conversion"));
	static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
	cl::desc("Disable Machine LICM"));
	static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden,
	cl::desc("Disable Machine Common Subexpression Elimination"));
	// Tri-state option (cl::boolOrDefault), unlike the plain bool switches around
	// it.
	static cl::opt<cl::boolOrDefault> OptimizeRegAlloc(
	"optimize-regalloc", cl::Hidden,
	cl::desc("Enable optimized register allocation compilation path."));
	// NOTE(review): the description below is identical to -disable-machine-licm's;
	// consider mentioning "post-RA" so the two are distinguishable in help text.
	static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
	cl::Hidden,
	cl::desc("Disable Machine LICM"));
	static cl::opt<bool> DisableMachineSink("disable-machine-sink", cl::Hidden,
	cl::desc("Disable Machine Sinking"));
	static cl::opt<bool> DisableLSR("disable-lsr", cl::Hidden,
	cl::desc("Disable Loop Strength Reduction Pass"));
	static cl::opt<bool> DisableConstantHoisting("disable-constant-hoisting",
	cl::Hidden, cl::desc("Disable ConstantHoisting"));
	static cl::opt<bool> DisableCGP("disable-cgp", cl::Hidden,
	cl::desc("Disable Codegen Prepare"));
	static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden,
	cl::desc("Disable Copy Propagation pass"));
	static cl::opt<bool> DisablePartialLibcallInlining("disable-partial-libcall-inlining",
	cl::Hidden, cl::desc("Disable Partial Libcall Inlining"));
	// The two options below are explicitly initialized to false.
	static cl::opt<bool> EnableImplicitNullChecks(
	"enable-implicit-null-checks",
	cl::desc("Fold null checks into faulting memory operations"),
	cl::init(false), cl::Hidden);
	static cl::opt<bool>
	EnableMergeICmps("enable-mergeicmps",
	cl::desc("Merge ICmp chains into a single memcmp"),
	cl::init(false), cl::Hidden);
	// Hidden printing switches.
	static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
	cl::desc("Print LLVM IR produced by the loop-reduce pass"));
	static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
	cl::desc("Print LLVM IR input to isel pass"));
	static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
	cl::desc("Dump garbage collector data"));
	// -verify-machineinstrs is declared cl::ZeroOrMore, so it may be given more
	// than once on a command line.
	static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
	cl::desc("Verify generated machine code"),
	cl::init(false),
	cl::ZeroOrMore);
	// Machine outliner switches.
	static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner",
	cl::Hidden,
	cl::desc("Enable machine outliner"));
	static cl::opt<bool> EnableLinkOnceODROutlining(
	"enable-linkonceodr-outlining",
	cl::Hidden,
	cl::desc("Enable the machine outliner on linkonceodr functions"),
	cl::init(false));
	// Enable or disable FastISel. Both options are needed, because
	// FastISel is enabled by default with -fast, and we wish to be
	// able to enable or disable fast-isel independently from -O0.
	static cl::opt<cl::boolOrDefault>
	EnableFastISelOption("fast-isel", cl::Hidden,
	cl::desc("Enable the \"fast\" instruction selector"));

	// Tri-state switch for the "global" instruction selector.
	static cl::opt<cl::boolOrDefault>
	EnableGlobalISel("global-isel", cl::Hidden,
	cl::desc("Enable the \"global\" instruction selector"));

	// -print-machineinstrs[=pass-name]. The value is optional and defaults to the
	// sentinel "option-unspecified". The TargetPassConfig constructor sets
	// TM.Options.PrintMachineCode when the stored value is the empty string
	// (presumably the flag given without a pass name -- confirm against the
	// CommandLine library's ValueOptional semantics).
	static cl::opt<std::string> PrintMachineInstrs(
	"print-machineinstrs", cl::ValueOptional, cl::desc("Print machine instrs"),
	cl::value_desc("pass-name"), cl::init("option-unspecified"), cl::Hidden);

	// -global-isel-abort=<0|1|2>, default 1. The meaning of each value is given
	// in the description string below.
	static cl::opt<int> EnableGlobalISelAbort(
	"global-isel-abort", cl::Hidden,
	cl::desc("Enable abort calls when \"global\" instruction selection "
	"fails to lower/select an instruction: 0 disable the abort, "
	"1 enable the abort, and "
	"2 disable the abort but emit a diagnostic on failure"),
	cl::init(1));

	// Temporary option to allow experimenting with MachineScheduler as a post-RA
	// scheduler. Targets can "properly" enable this with
	// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID).
	// Targets can return true in targetSchedulesPostRAScheduling() and
	// insert a PostRA scheduling pass wherever it wants.
	// Note: this option is not file-static.
	cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden,
	cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)"));

	// Experimental option to run live interval analysis early.
	static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
	cl::desc("Run live interval analysis earlier in the pipeline"));

	// Experimental option to use CFL-AA in codegen
	// CFLAAType enumerates the choices accepted by -use-cfl-aa-in-codegen:
	// "none" (the default), "steens", "anders" or "both".
	enum class CFLAAType { None, Steensgaard, Andersen, Both };
	static cl::opt<CFLAAType> UseCFLAA(
	"use-cfl-aa-in-codegen", cl::init(CFLAAType::None), cl::Hidden,
	cl::desc("Enable the new, experimental CFL alias analysis in CodeGen"),
	cl::values(clEnumValN(CFLAAType::None, "none", "Disable CFL-AA"),
	clEnumValN(CFLAAType::Steensgaard, "steens",
	"Enable unification-based CFL-AA"),
	clEnumValN(CFLAAType::Andersen, "anders",
	"Enable inclusion-based CFL-AA"),
	clEnumValN(CFLAAType::Both, "both",
	"Enable both variants of CFL-AA")));

	/// Option names for limiting the codegen pipeline.
	/// Those are used in error reporting and we didn't want
	/// to duplicate their names all over the place.
	/// They name the four options declared below and are also used by
	/// setStartStopPasses() and getLimitedCodeGenPipelineReason().
	const char *StartAfterOptName = "start-after";
	const char *StartBeforeOptName = "start-before";
	const char *StopAfterOptName = "stop-after";
	const char *StopBeforeOptName = "stop-before";

	// Each option takes a pass name and defaults to the empty string, which
	// getPassInfo() maps to "no pass".
	static cl::opt<std::string>
	StartAfterOpt(StringRef(StartAfterOptName),
	cl::desc("Resume compilation after a specific pass"),
	cl::value_desc("pass-name"), cl::init(""), cl::Hidden);

	static cl::opt<std::string>
	StartBeforeOpt(StringRef(StartBeforeOptName),
	cl::desc("Resume compilation before a specific pass"),
	cl::value_desc("pass-name"), cl::init(""), cl::Hidden);

	static cl::opt<std::string>
	StopAfterOpt(StringRef(StopAfterOptName),
	cl::desc("Stop compilation after a specific pass"),
	cl::value_desc("pass-name"), cl::init(""), cl::Hidden);

	static cl::opt<std::string>
	StopBeforeOpt(StringRef(StopBeforeOptName),
	cl::desc("Stop compilation before a specific pass"),
	cl::value_desc("pass-name"), cl::init(""), cl::Hidden);

	/// Honor a simple binary "disable" flag for a standard pass.
	///
	/// When \p Override is set the pass is suppressed by returning a
	/// default-constructed IdentifyingPassPtr; otherwise \p PassID is passed
	/// through unchanged, i.e. -disable-mypass=false has no effect.
	/// These should be converted to boolOrDefault in order to use applyOverride.
	static IdentifyingPassPtr applyDisable(IdentifyingPassPtr PassID,
	                                       bool Override) {
	  if (!Override)
	    return PassID;
	  return IdentifyingPassPtr();
	}

	/// Apply the command-line "disable" switches to a standard pass, no matter
	/// who is adding it.
	///
	/// \p StandardID is the pass as named in the standard pipeline and handed to
	/// addPass(). It may be a target-specific ID when a target adds its own pass
	/// directly; such IDs match nothing here and fall through harmlessly.
	///
	/// \p TargetID is the pass the target configured to stand in for StandardID.
	///
	/// StandardID may also be a pseudo ID, in which case TargetID names the real
	/// pass to run. That lets several options govern one pass depending on where
	/// in the pipeline it is added.
	static IdentifyingPassPtr overridePass(AnalysisID StandardID,
	                                       IdentifyingPassPtr TargetID) {
	  // One row per standard (or pseudo) pass ID and the switch that disables it.
	  struct DisableRule {
	    AnalysisID ID;
	    const cl::opt<bool> *Switch;
	  };
	  const DisableRule Rules[] = {
	      {&PostRASchedulerID, &DisablePostRASched},
	      {&BranchFolderPassID, &DisableBranchFold},
	      {&TailDuplicateID, &DisableTailDuplicate},
	      {&TargetPassConfig::EarlyTailDuplicateID, &DisableEarlyTailDup},
	      {&MachineBlockPlacementID, &DisableBlockPlacement},
	      {&StackSlotColoringID, &DisableSSC},
	      {&DeadMachineInstructionElimID, &DisableMachineDCE},
	      {&EarlyIfConverterID, &DisableEarlyIfConversion},
	      {&MachineLICMID, &DisableMachineLICM},
	      {&MachineCSEID, &DisableMachineCSE},
	      {&TargetPassConfig::PostRAMachineLICMID, &DisablePostRAMachineLICM},
	      {&MachineSinkingID, &DisableMachineSink},
	      {&MachineCopyPropagationID, &DisableCopyProp},
	  };

	  // The first matching row decides; the switch is read only on a match.
	  for (const DisableRule &Rule : Rules)
	    if (StandardID == Rule.ID)
	      return applyDisable(TargetID, *Rule.Switch);

	  // No switch governs this pass.
	  return TargetID;
	}

	//===---------------------------------------------------------------------===//
	/// TargetPassConfig
	//===---------------------------------------------------------------------===//

	// Register the pass under the name "targetpassconfig".
	INITIALIZE_PASS(TargetPassConfig, "targetpassconfig",
	"Target Pass Configuration", false, false)
	char TargetPassConfig::ID = 0;

	// Pseudo Pass IDs.
	// Only their addresses are used as identities: overridePass() compares
	// against them, and the TargetPassConfig constructor substitutes
	// TailDuplicateID and MachineLICMID for them.
	char TargetPassConfig::EarlyTailDuplicateID = 0;
	char TargetPassConfig::PostRAMachineLICMID = 0;

	namespace {

	/// One "insert this pass after that pass" request, together with the
	/// verify-after and print-after flags it was registered with.
	struct InsertedPass {
	  AnalysisID TargetPassID;
	  IdentifyingPassPtr InsertedPassID;
	  bool VerifyAfter;
	  bool PrintAfter;

	  InsertedPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID,
	               bool VerifyAfter, bool PrintAfter)
	      : TargetPassID(TargetPassID), InsertedPassID(InsertedPassID),
	        VerifyAfter(VerifyAfter), PrintAfter(PrintAfter) {}

	  /// Produce the pass to insert: the stored instance when there is one,
	  /// otherwise a pass newly created from the stored ID.
	  Pass *getInsertedPass() const {
	    assert(InsertedPassID.isValid() && "Illegal Pass ID!");
	    if (!InsertedPassID.isInstance()) {
	      Pass *Created = Pass::createPass(InsertedPassID.getID());
	      assert(Created && "Pass ID not registered");
	      return Created;
	    }
	    return InsertedPassID.getInstance();
	  }
	};

	} // end anonymous namespace

	namespace llvm {

	// Private state of TargetPassConfig: it is allocated in the TargetPassConfig
	// constructor and deleted in its destructor.
	class PassConfigImpl {
	public:
	// List of passes explicitly substituted by this target. Normally this is
	// empty, but it is a convenient way to suppress or replace specific passes
	// that are part of a standard pass pipeline without overriding the entire
	// pipeline. This mechanism allows target options to inherit a standard pass's
	// user interface. For example, a target may disable a standard pass by
	// default by substituting a pass ID of zero, and the user may still enable
	// that standard pass with an explicit command line option.
	DenseMap<AnalysisID,IdentifyingPassPtr> TargetPasses;

	/// Store the pairs of <AnalysisID, AnalysisID> of which the second pass
	/// is inserted after each instance of the first one.
	SmallVector<InsertedPass, 4> InsertedPasses;
	};

	} // end namespace llvm

	// Out-of-line destructor: releases the PassConfigImpl allocated by the
	// constructor.
	TargetPassConfig::~TargetPassConfig() { delete Impl; }

	/// Look up the registered PassInfo for \p PassName.
	///
	/// \returns nullptr for an empty name. A non-empty name that is not
	/// registered is reported through report_fatal_error().
	static const PassInfo *getPassInfo(StringRef PassName) {
	  if (PassName.empty())
	    return nullptr;

	  const PassRegistry *Registry = PassRegistry::getPassRegistry();
	  const PassInfo *Info = Registry->getPassInfo(PassName);
	  if (Info == nullptr)
	    report_fatal_error(Twine('\"') + Twine(PassName) +
	                       Twine("\" pass is not registered."));
	  return Info;
	}

	/// Map a pass name to its AnalysisID, or nullptr when getPassInfo() finds
	/// nothing for it (i.e. the name is empty).
	static AnalysisID getPassIDFromName(StringRef PassName) {
	  if (const PassInfo *Info = getPassInfo(PassName))
	    return Info->getTypeInfo();
	  return nullptr;
	}

	/// Resolve the start/stop pass-name options into pass IDs.
	///
	/// Specifying both the "before" and the "after" variant of either pair is a
	/// fatal error. Started begins true only when no start pass was requested.
	void TargetPassConfig::setStartStopPasses() {
	  StartBefore = getPassIDFromName(StartBeforeOpt);
	  StartAfter = getPassIDFromName(StartAfterOpt);
	  StopBefore = getPassIDFromName(StopBeforeOpt);
	  StopAfter = getPassIDFromName(StopAfterOpt);

	  const bool BothStarts = StartBefore && StartAfter;
	  if (BothStarts) {
	    report_fatal_error(Twine(StartBeforeOptName) + Twine(" and ") +
	                       Twine(StartAfterOptName) + Twine(" specified!"));
	  }

	  const bool BothStops = StopBefore && StopAfter;
	  if (BothStops) {
	    report_fatal_error(Twine(StopBeforeOptName) + Twine(" and ") +
	                       Twine(StopAfterOptName) + Twine(" specified!"));
	  }

	  Started = !StartAfter && !StartBefore;
	}

	// Out-of-line constructor: sets up default values for the pass options and
	// registers all common codegen passes.
	TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm)
	    : ImmutablePass(ID), PM(&pm), TM(&TM) {
	  Impl = new PassConfigImpl();

	  PassRegistry &Registry = *PassRegistry::getPassRegistry();

	  // Activate the PassIDs of every target independent codegen pass (this pass
	  // included), followed by the alias analysis passes that codegen passes
	  // require.
	  initializeCodeGen(Registry);
	  initializeBasicAAWrapperPassPass(Registry);
	  initializeAAResultsWrapperPassPass(Registry);

	  // Map the pseudo pass IDs onto the real passes they stand for.
	  substitutePass(&EarlyTailDuplicateID, &TailDuplicateID);
	  substitutePass(&PostRAMachineLICMID, &MachineLICMID);

	  // An empty -print-machineinstrs value turns on machine code printing.
	  if (StringRef(PrintMachineInstrs.getValue()).equals("")) {
	    TM.Options.PrintMachineCode = true;
	  }

	  // Use the target's default for IPRA unless the flag was given explicitly.
	  if (!EnableIPRA.getNumOccurrences()) {
	    TM.Options.EnableIPRA = TM.useIPRA();
	  } else {
	    TM.Options.EnableIPRA = EnableIPRA;
	  }

	  if (TM.Options.EnableIPRA) {
	    setRequiresCodeGenSCCOrder();
	  }

	  setStartStopPasses();
	}

	/// Forward the optimization level of the owning target machine.
	CodeGenOpt::Level TargetPassConfig::getOptLevel() const {
	  const CodeGenOpt::Level Level = TM->getOptLevel();
	  return Level;
	}

	/// Insert InsertedPassID pass after TargetPassID.
	///
	/// The request is appended to Impl->InsertedPasses together with the
	/// \p VerifyAfter and \p PrintAfter flags. Asserts that the inserted pass,
	/// whether given by ID or as an instance, is not \p TargetPassID itself.
	void TargetPassConfig::insertPass(AnalysisID TargetPassID,
	                                  IdentifyingPassPtr InsertedPassID,
	                                  bool VerifyAfter, bool PrintAfter) {
	  assert(((!InsertedPassID.isInstance() &&
	           TargetPassID != InsertedPassID.getID()) ||
	          (InsertedPassID.isInstance() &&
	           TargetPassID != InsertedPassID.getInstance()->getPassID())) &&
	         "Insert a pass after itself!");
	  Impl->InsertedPasses.emplace_back(TargetPassID, InsertedPassID, VerifyAfter,
	                                    PrintAfter);
	}

	/// createPassConfig - Build the pass configuration object that the
	/// addPassToEmitX methods use to generate a pipeline of CodeGen passes.
	///
	/// Targets may override this to extend TargetPassConfig.
	TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) {
	  TargetPassConfig *Config = new TargetPassConfig(*this, PM);
	  return Config;
	}

	/// Default constructor. A TargetPassConfig cannot work without a target
	/// machine, so this constructor always ends in report_fatal_error().
	TargetPassConfig::TargetPassConfig()
	: ImmutablePass(ID) {
	report_fatal_error("Trying to construct TargetPassConfig without a target "
	"machine. Scheduling a CodeGen pass without a target "
	"triple set?");
	}

	/// Return true if any of the StartBefore, StartAfter, StopBefore or StopAfter
	/// markers is set, i.e. only part of the codegen pipeline will be added.
	bool TargetPassConfig::hasLimitedCodeGenPipeline() const {
	return StartBefore \|\| StartAfter \|\| StopBefore \|\| StopAfter;
	}

	std::string
	TargetPassConfig::getLimitedCodeGenPipelineReason(const char *Separator) const {
	if (!hasLimitedCodeGenPipeline())
	return std::string();
	std::string Res;
	static cl::opt<std::string> *PassNames[] = {&StartAfterOpt, &StartBeforeOpt,
	&StopAfterOpt, &StopBeforeOpt};
	static const char *OptNames[] = {StartAfterOptName, StartBeforeOptName,
	StopAfterOptName, StopBeforeOptName};
	bool IsFirst = true;
	for (int Idx = 0; Idx < 4; ++Idx)
	if (!PassNames[Idx]->empty()) {
	if (!IsFirst)
	Res += Separator;
	IsFirst = false;
	Res += OptNames[Idx];
	}
	return Res;
	}

	// Helper to verify the analysis is really immutable.
	// Stores Val into Opt, asserting that this configuration has not been marked
	// Initialized yet.
	void TargetPassConfig::setOpt(bool &Opt, bool Val) {
	assert(!Initialized && "PassConfig is immutable");
	Opt = Val;
	}

	/// Record that \p TargetID is to be used wherever \p StandardID is requested.
	/// A later call for the same \p StandardID replaces the earlier entry.
	void TargetPassConfig::substitutePass(AnalysisID StandardID,
	IdentifyingPassPtr TargetID) {
	Impl->TargetPasses[StandardID] = TargetID;
	}

	/// Look up the substitution recorded for \p ID by substitutePass().
	/// Returns the recorded replacement, or \p ID itself when none was recorded.
	IdentifyingPassPtr TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
	  DenseMap<AnalysisID, IdentifyingPassPtr>::const_iterator Found =
	      Impl->TargetPasses.find(ID);
	  if (Found != Impl->TargetPasses.end())
	    return Found->second;
	  return ID;
	}

	/// Return true if requesting \p ID would not yield the standard pass for that
	/// ID: after applying getPassSubstitution() and overridePass(), the result is
	/// invalid, is a concrete pass instance, or names a different ID.
	bool TargetPassConfig::isPassSubstitutedOrOverridden(AnalysisID ID) const {
	IdentifyingPassPtr TargetID = getPassSubstitution(ID);
	IdentifyingPassPtr FinalPtr = overridePass(ID, TargetID);
	return !FinalPtr.isValid() \|\| FinalPtr.isInstance() \|\|
	FinalPtr.getID() != ID;
	}

	/// Add a pass to the PassManager if that pass is supposed to be run. If the
	/// Started/Stopped flags indicate either that the compilation should start at
	/// a later pass or that it should stop after an earlier pass, then do not add
	/// the pass. Finally, compare the current pass against the StartAfter
	/// and StopAfter options and change the Started/Stopped flags accordingly.
	///
	/// This call takes over \p P: it is either handed to the pass manager or
	/// deleted. While machine passes are being added, \p verifyAfter and
	/// \p printAfter request a verifier / printer pass right after \p P.
	void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
	assert(!Initialized && "PassConfig is immutable");

	// Cache the Pass ID here in case the pass manager finds this pass is
	// redundant with ones already scheduled / available, and deletes it.
	// Fundamentally, once we add the pass to the manager, we no longer own it
	// and shouldn't reference it.
	AnalysisID PassID = P->getPassID();

	// The "before" markers take effect ahead of the decision below, so a
	// StartBefore pass is itself added and a StopBefore pass is itself skipped.
	if (StartBefore == PassID)
	Started = true;
	if (StopBefore == PassID)
	Stopped = true;
	if (Started && !Stopped) {
	std::string Banner;
	// Construct banner message before PM->add() as that may delete the pass.
	if (AddingMachinePasses && (printAfter \|\| verifyAfter))
	Banner = std::string("After ") + std::string(P->getPassName());
	PM->add(P);
	if (AddingMachinePasses) {
	if (printAfter)
	addPrintPass(Banner);
	if (verifyAfter)
	addVerifyPass(Banner);
	}

	// Add the passes after the pass P if there is any.
	// Each inserted pass goes through addPass() again, so passes registered to
	// follow an inserted pass are picked up as well.
	for (auto IP : Impl->InsertedPasses) {
	if (IP.TargetPassID == PassID)
	addPass(IP.getInsertedPass(), IP.VerifyAfter, IP.PrintAfter);
	}
	} else {
	// Outside the requested start/stop window: the pass is not scheduled and
	// is destroyed here.
	delete P;
	}
	// The "after" markers take effect only once the pass itself was handled, so
	// a StopAfter pass is still added and a StartAfter pass is still skipped.
	if (StopAfter == PassID)
	Stopped = true;
	if (StartAfter == PassID)
	Started = true;
	// Reaching a stop marker while nothing has been started is a fatal error.
	if (Stopped && !Started)
	report_fatal_error("Cannot stop compilation after pass that is not run");
	}

	/// Add a CodeGen pass at this point in the pipeline after checking for target
	/// and command line overrides.
	///
	/// addPass cannot return a pointer to the pass instance because that instance
	/// is internal to the PassManager and may already have been freed by the time
	/// this function returns.
	///
	/// Returns the ID of the pass that was finally chosen, or nullptr if the
	/// substitution/override step produced an invalid (disabled) pass.
	AnalysisID TargetPassConfig::addPass(AnalysisID PassID, bool verifyAfter,
	bool printAfter) {
	IdentifyingPassPtr TargetID = getPassSubstitution(PassID);
	IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID);
	if (!FinalPtr.isValid())
	return nullptr;

	// Use the supplied instance if there is one; otherwise create the pass
	// from its ID.
	Pass *P;
	if (FinalPtr.isInstance())
	P = FinalPtr.getInstance();
	else {
	P = Pass::createPass(FinalPtr.getID());
	if (!P)
	llvm_unreachable("Pass ID not registered");
	}
	// Read the ID before handing the pass over; P must not be used afterwards.
	AnalysisID FinalID = P->getPassID();
	addPass(P, verifyAfter, printAfter); // Ends the lifetime of P.

	return FinalID;
	}

	/// Request both a printer pass and a verifier pass labelled with \p Banner.
	/// Whether each is actually added is decided by addPrintPass() and
	/// addVerifyPass().
	void TargetPassConfig::printAndVerify(const std::string &Banner) {
	addPrintPass(Banner);
	addVerifyPass(Banner);
	}

	/// Add a machine function printer pass that writes to dbgs() with \p Banner,
	/// but only if the target machine reports that machine code should be
	/// printed. The pass is given to the pass manager directly, not through
	/// addPass(), so the start/stop handling there does not apply to it.
	void TargetPassConfig::addPrintPass(const std::string &Banner) {
	if (TM->shouldPrintMachineCode())
	PM->add(createMachineFunctionPrinterPass(dbgs(), Banner));
	}

	/// Add a machine verifier pass labelled with \p Banner if verification is
	/// enabled. The VerifyMachineCode option decides; in EXPENSIVE_CHECKS builds
	/// an unset option defers to TM->isMachineVerifierClean() instead. The pass
	/// is given to the pass manager directly, not through addPass().
	void TargetPassConfig::addVerifyPass(const std::string &Banner) {
	bool Verify = VerifyMachineCode;
	#ifdef EXPENSIVE_CHECKS
	if (VerifyMachineCode == cl::BOU_UNSET)
	Verify = TM->isMachineVerifierClean();
	#endif
	if (Verify)
	PM->add(createMachineVerifierPass(Banner));
	}

	/// Add common target configurable passes that perform LLVM IR to IR transforms
	/// following machine independent optimization.
	///
	/// Several of the passes below are only added when the optimization level is
	/// not CodeGenOpt::None and the matching Disable*/Enable* option permits it.
	void TargetPassConfig::addIRPasses() {
	// Optional CFL-based alias analyses, selected by the UseCFLAA option.
	switch (UseCFLAA) {
	case CFLAAType::Steensgaard:
	addPass(createCFLSteensAAWrapperPass());
	break;
	case CFLAAType::Andersen:
	addPass(createCFLAndersAAWrapperPass());
	break;
	case CFLAAType::Both:
	addPass(createCFLAndersAAWrapperPass());
	addPass(createCFLSteensAAWrapperPass());
	break;
	default:
	// Any other setting adds no CFL alias analysis.
	break;
	}

	// Basic AliasAnalysis support.
	// Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
	// BasicAliasAnalysis wins if they disagree. This is intended to help
	// support "obvious" type-punning idioms.
	addPass(createTypeBasedAAWrapperPass());
	addPass(createScopedNoAliasAAWrapperPass());
	addPass(createBasicAAWrapperPass());

	// Before running any passes, run the verifier to determine if the input
	// coming from the front-end and/or optimizer is valid.
	if (!DisableVerify)
	addPass(createVerifierPass());

	// Run loop strength reduction before anything else.
	if (getOptLevel() != CodeGenOpt::None && !DisableLSR) {
	addPass(createLoopStrengthReducePass());
	// Optionally dump the IR right after LSR.
	if (PrintLSR)
	addPass(createPrintFunctionPass(dbgs(), "\n\n* Code after LSR *\n"));
	}

	if (getOptLevel() != CodeGenOpt::None) {
	// The MergeICmpsPass tries to create memcmp calls by grouping sequences of
	// loads and compares. ExpandMemCmpPass then tries to expand those calls
	// into optimally-sized loads and compares. The transforms are enabled by a
	// target lowering hook.
	if (EnableMergeICmps)
	addPass(createMergeICmpsPass());
	addPass(createExpandMemCmpPass());
	}

	// Run GC lowering passes for builtin collectors
	// TODO: add a pass insertion point here
	addPass(createGCLoweringPass());
	addPass(createShadowStackGCLoweringPass());

	// Make sure that no unreachable blocks are instruction selected.
	addPass(createUnreachableBlockEliminationPass());

	// Prepare expensive constants for SelectionDAG.
	if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting)
	addPass(createConstantHoistingPass());

	if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining)
	addPass(createPartiallyInlineLibCallsPass());

	// Instrument function entry and exit, e.g. with calls to mcount().
	addPass(createPostInlineEntryExitInstrumenterPass());

	// Add the pass that scalarizes masked memory intrinsics the target does not
	// support. The unsupported intrinsic will be replaced with a chain of basic
	// blocks that store/load elements one by one if the appropriate mask bit is
	// set.
	addPass(createScalarizeMaskedMemIntrinPass());

	// Expand reduction intrinsics into shuffle sequences if the target wants to.
	addPass(createExpandReductionsPass());
	}

	/// Turn exception handling constructs into something the code generators can
	/// handle.
	///
	/// The set of passes is chosen by the exception handling type reported by the
	/// target's MCAsmInfo, which must be available.
	void TargetPassConfig::addPassesToHandleExceptions() {
	const MCAsmInfo *MCAI = TM->getMCAsmInfo();
	assert(MCAI && "No MCAsmInfo");
	switch (MCAI->getExceptionHandlingType()) {
	case ExceptionHandling::SjLj:
	// SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both.
	// Dwarf EH prepare needs to be run after SjLj prepare. Otherwise,
	// catch info can get misplaced when a selector ends up more than one block
	// removed from the parent invoke(s). This could happen when a landing
	// pad is shared by multiple invokes and is also a target of a normal
	// edge from elsewhere.
	addPass(createSjLjEHPreparePass());
	// Deliberately continue into the Dwarf case to add the Dwarf EH pass too.
	LLVM_FALLTHROUGH;
	case ExceptionHandling::DwarfCFI:
	case ExceptionHandling::ARM:
	addPass(createDwarfEHPass());
	break;
	case ExceptionHandling::WinEH:
	// We support using both GCC-style and MSVC-style exceptions on Windows, so
	// add both preparation passes. Each pass will only actually run if it
	// recognizes the personality function.
	addPass(createWinEHPass());
	addPass(createDwarfEHPass());
	break;
	case ExceptionHandling::None:
	addPass(createLowerInvokePass());

	// The lower invoke pass may create unreachable code. Remove it.
	addPass(createUnreachableBlockEliminationPass());
	break;
	}
	}

	/// Add pass to prepare the LLVM IR for code generation. This should be done
	/// before exception handling preparation passes.
	///
	/// CodeGenPrepare itself is only added when optimizing and not disabled via
	/// DisableCGP; the rewrite-symbols pass is always added.
	void TargetPassConfig::addCodeGenPrepare() {
	if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
	addPass(createCodeGenPreparePass());
	addPass(createRewriteSymbolsPass());
	}

	/// Add common passes that perform LLVM IR to IR transforms in preparation for
	/// instruction selection.
	void TargetPassConfig::addISelPrepare() {
	// Give the target its pre-ISel hook first.
	addPreISel();

	// Force codegen to run according to the callgraph.
	if (requiresCodeGenSCCOrder())
	addPass(new DummyCGSCCPass);

	// Add both the safe stack and the stack protection passes: each of them will
	// only protect functions that have corresponding attributes.
	addPass(createSafeStackPass());
	addPass(createStackProtectorPass());

	// Optionally dump the final IR handed to instruction selection.
	if (PrintISelInput)
	addPass(createPrintFunctionPass(
	dbgs(), "\n\n* Final LLVM Code input to ISel *\n"));

	// All passes which modify the LLVM IR are now complete; run the verifier
	// to ensure that the IR is valid.
	if (!DisableVerify)
	addPass(createVerifierPass());
	}

	/// Add the instruction selection passes proper: either the GlobalISel
	/// pipeline (optionally with a fallback selector) or the target's regular
	/// instruction selector.
	///
	/// Returns true as soon as one of the add* hooks called here returns true,
	/// and false otherwise.
	bool TargetPassConfig::addCoreISelPasses() {
	// Enable FastISel with -fast, but allow that to be overridden.
	TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
	if (EnableFastISelOption == cl::BOU_TRUE \|\|
	(TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel()))
	TM->setFastISel(true);

	// Ask the target for an isel.
	// Enable GlobalISel if the target wants to, but allow that to be overridden.
	// Explicitly enabling fast-isel should override implicitly enabled
	// global-isel.
	if (EnableGlobalISel == cl::BOU_TRUE \|\|
	(EnableGlobalISel == cl::BOU_UNSET && isGlobalISelEnabled() &&
	EnableFastISelOption != cl::BOU_TRUE)) {
	// GlobalISel was chosen, so FastISel is switched off again.
	TM->setFastISel(false);

	if (addIRTranslator())
	return true;

	addPreLegalizeMachineIR();

	if (addLegalizeMachineIR())
	return true;

	// Before running the register bank selector, ask the target if it
	// wants to run some passes.
	addPreRegBankSelect();

	if (addRegBankSelect())
	return true;

	addPreGlobalInstructionSelect();

	if (addGlobalInstructionSelect())
	return true;

	// Pass to reset the MachineFunction if the ISel failed.
	addPass(createResetMachineFunctionPass(
	reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled()));

	// Provide a fallback path when we do not want to abort on
	// not-yet-supported input.
	if (!isGlobalISelAbortEnabled() && addInstSelector())
	return true;

	} else if (addInstSelector())
	return true;

	return false;
	}

	/// Add everything up to and including instruction selection: the IR-level
	/// lowering and preparation stages, followed by addCoreISelPasses(), whose
	/// result is returned.
	bool TargetPassConfig::addISelPasses() {
	// Emulated TLS lowering is only needed when the target option asks for it.
	if (TM->Options.EmulatedTLS)
	addPass(createLowerEmuTLSPass());

	addPass(createPreISelIntrinsicLoweringPass());
	addPass(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
	addIRPasses();
	addCodeGenPrepare();
	addPassesToHandleExceptions();
	addISelPrepare();

	return addCoreISelPasses();
	}

	/// Placeholder allocator constructor that creates nothing (returns nullptr).
	/// It is the initial value of the -regalloc option below and is compared
	/// against elsewhere in this file to detect that no specific allocator was
	/// selected.
	static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
	/// -regalloc=... command line option.
	static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
	RegisterPassParser<RegisterRegAlloc>>
	RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
	cl::desc("Register allocator to use"));

	/// Add the complete set of target-independent postISel code generator passes.
	///
	/// This can be read as the standard order of major LLVM CodeGen stages. Stages
	/// with nontrivial configuration or multiple passes are broken out below in
	/// add%Stage routines.
	///
	/// Any TargetPassConfig::addXX routine may be overridden by the Target. The
	/// addPre/Post methods with empty header implementations allow injecting
	/// target-specific fixups just before or after major stages. Additionally,
	/// targets have the flexibility to change pass order within a stage by
	/// overriding default implementation of add%Stage routines below. Each
	/// technique has maintainability tradeoffs because alternate pass orders are
	/// not well supported. addPre/Post works better if the target pass is easily
	/// tied to a common pass. But if it has subtle dependencies on multiple passes,
	/// the target should override the stage instead.
	///
	/// TODO: We could use a single addPre/Post(ID) hook to allow pass injection
	/// before/after any target-independent pass. But it's currently overkill.
	void TargetPassConfig::addMachinePasses() {
	// While this flag is set, addPass() honours its verifyAfter/printAfter
	// arguments; it is cleared again at the end of this function.
	AddingMachinePasses = true;

	// Insert a machine instr printer pass after the specified pass.
	if (!StringRef(PrintMachineInstrs.getValue()).equals("") &&
	!StringRef(PrintMachineInstrs.getValue()).equals("option-unspecified")) {
	const PassRegistry *PR = PassRegistry::getPassRegistry();
	const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
	const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
	assert (TPI && IPI && "Pass ID not registered!");
	// NOTE(review): the two declarations below look like they lost their '*'
	// pointer declarators when this page was extracted - confirm against the
	// upstream file.
	const char TID = (const char )(TPI->getTypeInfo());
	const char IID = (const char )(IPI->getTypeInfo());
	insertPass(TID, IID);
	}

	// Print the instruction selected machine code...
	printAndVerify("After Instruction Selection");

	// Expand pseudo-instructions emitted by ISel.
	addPass(&ExpandISelPseudosID);

	// Add passes that optimize machine instructions in SSA form.
	if (getOptLevel() != CodeGenOpt::None) {
	addMachineSSAOptimization();
	} else {
	// If the target requests it, assign local variables to stack slots relative
	// to one another and simplify frame index references where possible.
	addPass(&LocalStackSlotAllocationID, false);
	}

	if (TM->Options.EnableIPRA)
	addPass(createRegUsageInfoPropPass());

	// Run pre-ra passes.
	addPreRegAlloc();

	// Run register allocation and passes that are tightly coupled with it,
	// including phi elimination and scheduling.
	if (getOptimizeRegAlloc())
	addOptimizedRegAlloc(createRegAllocPass(true));
	else {
	// The unoptimized path only accepts the default or the fast allocator.
	if (RegAlloc != &useDefaultRegisterAllocator &&
	RegAlloc != &createFastRegisterAllocator)
	report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
	addFastRegAlloc(createRegAllocPass(false));
	}

	// Run post-ra passes.
	addPostRegAlloc();

	// Insert prolog/epilog code. Eliminate abstract frame index references...
	if (getOptLevel() != CodeGenOpt::None)
	addPass(&ShrinkWrapID);

	// Prolog/Epilog inserter needs a TargetMachine to instantiate. But only
	// do so if it hasn't been disabled, substituted, or overridden.
	if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID))
	addPass(createPrologEpilogInserterPass());

	/// Add passes that optimize machine instructions after register allocation.
	if (getOptLevel() != CodeGenOpt::None)
	addMachineLateOptimization();

	// Expand pseudo instructions before second scheduling pass.
	addPass(&ExpandPostRAPseudosID);

	// Run pre-sched2 passes.
	addPreSched2();

	if (EnableImplicitNullChecks)
	addPass(&ImplicitNullChecksID);

	// Second pass scheduler.
	// Let Target optionally insert this pass by itself at some other
	// point.
	if (getOptLevel() != CodeGenOpt::None &&
	!TM->targetSchedulesPostRAScheduling()) {
	if (MISchedPostRA)
	addPass(&PostMachineSchedulerID);
	else
	addPass(&PostRASchedulerID);
	}

	// GC
	if (addGCPasses()) {
	if (PrintGCInfo)
	addPass(createGCInfoPrinter(dbgs()), false, false);
	}

	// Basic block placement.
	if (getOptLevel() != CodeGenOpt::None)
	addBlockPlacement();

	addPreEmitPass();

	if (TM->Options.EnableIPRA)
	// Collect register usage information and produce a register mask of
	// clobbered registers, to be used to optimize call sites.
	addPass(createRegUsageInfoCollector());

	addPass(&FuncletLayoutID, false);

	addPass(&StackMapLivenessID, false);
	addPass(&LiveDebugValuesID, false);

	// Insert before XRay Instrumentation.
	addPass(&FEntryInserterID, false);

	addPass(&XRayInstrumentationID, false);
	addPass(&PatchableFunctionID, false);

	// The outliner is handed to the pass manager directly rather than through
	// addPass().
	if (EnableMachineOutliner)
	PM->add(createMachineOutlinerPass(EnableLinkOnceODROutlining));

	+ // Add passes that directly emit MI after all other MI passes.
	+ addPreEmitPass2();
	+
	AddingMachinePasses = false;
	}

	/// Add passes that optimize machine instructions in SSA form.
	///
	/// addMachinePasses() calls this only when the optimization level is not
	/// CodeGenOpt::None.
	void TargetPassConfig::addMachineSSAOptimization() {
	// Pre-ra tail duplication.
	addPass(&EarlyTailDuplicateID);

	// Optimize PHIs before DCE: removing dead PHI cycles may make more
	// instructions dead.
	addPass(&OptimizePHIsID, false);

	// This pass merges large allocas. StackSlotColoring is a different pass
	// which merges spill slots.
	addPass(&StackColoringID, false);

	// If the target requests it, assign local variables to stack slots relative
	// to one another and simplify frame index references where possible.
	addPass(&LocalStackSlotAllocationID, false);

	// With optimization, dead code should already be eliminated. However
	// there is one known exception: lowered code for arguments that are only
	// used by tail calls, where the tail calls reuse the incoming stack
	// arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
	addPass(&DeadMachineInstructionElimID);

	// Allow targets to insert passes that improve instruction level parallelism,
	// like if-conversion. Such passes will typically need dominator trees and
	// loop info, just like LICM and CSE below.
	addILPOpts();

	addPass(&MachineLICMID, false);
	addPass(&MachineCSEID, false);

	addPass(&MachineSinkingID);

	addPass(&PeepholeOptimizerID);
	// Clean-up the dead code that may have been generated by peephole
	// rewriting.
	addPass(&DeadMachineInstructionElimID);
	}

	//===---------------------------------------------------------------------===//
	/// Register Allocation Pass Configuration
	//===---------------------------------------------------------------------===//

	/// Decide whether the optimized register allocation path is used.
	/// An explicit OptimizeRegAlloc setting is returned as is; when the option
	/// is unset the answer is "yes" unless the optimization level is
	/// CodeGenOpt::None.
	bool TargetPassConfig::getOptimizeRegAlloc() const {
	  if (OptimizeRegAlloc == cl::BOU_TRUE)
	    return true;
	  if (OptimizeRegAlloc == cl::BOU_FALSE)
	    return false;
	  if (OptimizeRegAlloc == cl::BOU_UNSET)
	    return getOptLevel() != CodeGenOpt::None;
	  llvm_unreachable("Invalid optimize-regalloc state");
	}

	/// RegisterRegAlloc's global Registry tracks allocator registration.
	MachinePassRegistry RegisterRegAlloc::Registry;

	/// Guards the one-time initialization of the default register allocator
	/// performed through llvm::call_once in createRegAllocPass().
	static llvm::once_flag InitializeDefaultRegisterAllocatorFlag;

	/// A dummy default pass factory indicates whether the register allocator is
	/// overridden on the command line.
	static RegisterRegAlloc
	defaultRegAlloc("default",
	"pick register allocator based on -O option",
	useDefaultRegisterAllocator);

	/// One-time initializer for the global default register allocator
	/// constructor, run through llvm::call_once from createRegAllocPass().
	///
	/// If no default has been set yet, the current value of the -regalloc option
	/// is installed as the default; an existing default is left untouched.
	static void initializeDefaultRegisterAllocatorOnce() {
	  // The former local copy of the constructor was assigned but never read, so
	  // only the check and the registration remain.
	  if (RegisterRegAlloc::getDefault())
	    return;
	  RegisterRegAlloc::setDefault(RegAlloc);
	}

	/// Create this target's default register allocator pass for the optimized
	/// (\p Optimized is true) or unoptimized allocation path. The result is added
	/// to the pass manager by addOptimizedRegAlloc or addFastRegAlloc
	/// respectively.
	///
	/// Targets that keep the standard regalloc pass order may still override
	/// this to pick their own allocator, but -regalloc=... always takes
	/// precedence.
	FunctionPass *TargetPassConfig::createTargetRegisterAllocator(bool Optimized) {
	  if (!Optimized)
	    return createFastRegisterAllocator();
	  return createGreedyRegisterAllocator();
	}

	/// Find and instantiate the register allocation pass to use at the current
	/// optimization level. Register allocators are separate passes because they
	/// may require different analyses.
	///
	/// Going through this helper keeps the regalloc= option available even for
	/// targets that override the default allocator.
	///
	/// FIXME: When MachinePassRegistry register pass IDs instead of function ptrs,
	/// this can be folded into addPass.
	FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
	  // Make sure the global default has been initialized exactly once.
	  llvm::call_once(InitializeDefaultRegisterAllocatorFlag,
	                  initializeDefaultRegisterAllocatorOnce);

	  RegisterRegAlloc::FunctionPassCtor Selected = RegisterRegAlloc::getDefault();

	  // No -regalloc= override: let the target choose its allocator.
	  if (Selected == useDefaultRegisterAllocator)
	    return createTargetRegisterAllocator(Optimized);

	  return Selected();
	}

	/// Return true if the default global register allocator is in use and
	/// has not been overridden on the command line with '-regalloc=...'.
	/// This is determined solely by whether the option occurred at all.
	bool TargetPassConfig::usingDefaultRegAlloc() const {
	return RegAlloc.getNumOccurrences() == 0;
	}

	/// Add the minimum set of target-independent passes that are required for
	/// register allocation. No coalescing or scheduling.
	///
	/// \p RegAllocPass may be null, in which case only PHI elimination and the
	/// two-address pass are added.
	void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
	addPass(&PHIEliminationID, false);
	addPass(&TwoAddressInstructionPassID, false);

	if (RegAllocPass)
	addPass(RegAllocPass);
	}

	/// Add standard target-independent passes that are tightly coupled with
	/// optimized register allocation, including coalescing, machine instruction
	/// scheduling, and register allocation itself.
	///
	/// \p RegAllocPass may be null; then the allocator and everything that
	/// follows it here (pre-rewrite hook, virtual register rewriting, stack slot
	/// coloring, post-RA machine LICM) is left out.
	void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
	addPass(&DetectDeadLanesID, false);

	addPass(&ProcessImplicitDefsID, false);

	// LiveVariables currently requires pure SSA form.
	//
	// FIXME: Once TwoAddressInstruction pass no longer uses kill flags,
	// LiveVariables can be removed completely, and LiveIntervals can be directly
	// computed. (We still either need to regenerate kill flags after regalloc, or
	// preferably fix the scavenger to not depend on them).
	addPass(&LiveVariablesID, false);

	// Edge splitting is smarter with machine loop info.
	addPass(&MachineLoopInfoID, false);
	addPass(&PHIEliminationID, false);

	// Eventually, we want to run LiveIntervals before PHI elimination.
	if (EarlyLiveIntervals)
	addPass(&LiveIntervalsID, false);

	addPass(&TwoAddressInstructionPassID, false);
	addPass(&RegisterCoalescerID);

	// The machine scheduler may accidentally create disconnected components
	// when moving subregister definitions around, avoid this by splitting them to
	// separate vregs before. Splitting can also improve reg. allocation quality.
	addPass(&RenameIndependentSubregsID);

	// PreRA instruction scheduling.
	addPass(&MachineSchedulerID);

	if (RegAllocPass) {
	// Add the selected register allocation pass.
	addPass(RegAllocPass);

	// Allow targets to change the register assignments before rewriting.
	addPreRewrite();

	// Finally rewrite virtual registers.
	addPass(&VirtRegRewriterID);

	// Perform stack slot coloring and post-ra machine LICM.
	//
	// FIXME: Re-enable coloring with register when it's capable of adding
	// kill markers.
	addPass(&StackSlotColoringID);

	// Run post-ra machine LICM to hoist reloads / remats.
	//
	// FIXME: can this move into MachineLateOptimization?
	addPass(&PostRAMachineLICMID);
	}
	}

	//===---------------------------------------------------------------------===//
	/// Post RegAlloc Pass Configuration
	//===---------------------------------------------------------------------===//

	/// Add passes that optimize machine instructions after register allocation.
	///
	/// addMachinePasses() calls this only when the optimization level is not
	/// CodeGenOpt::None.
	void TargetPassConfig::addMachineLateOptimization() {
	// Branch folding must be run after regalloc and prolog/epilog insertion.
	addPass(&BranchFolderPassID);

	// Tail duplication.
	// Note that duplicating tail just increases code size and degrades
	// performance for targets that require Structured Control Flow.
	// In addition it can also make CFG irreducible. Thus we disable it.
	if (!TM->requiresStructuredCFG())
	addPass(&TailDuplicateID);

	// Copy propagation.
	addPass(&MachineCopyPropagationID);
	}

	/// Add standard GC passes.
	///
	/// This implementation adds the GC machine code analysis pass and always
	/// returns true; addMachinePasses() uses the result to decide whether the GC
	/// info printer may be added.
	bool TargetPassConfig::addGCPasses() {
	addPass(&GCMachineCodeAnalysisID, false);
	return true;
	}

	/// Add the standard basic block placement pass and, if it was actually
	/// scheduled and EnableBlockPlacementStats is set, the separate pass that
	/// collects block placement statistics.
	void TargetPassConfig::addBlockPlacement() {
	  // addPass() returns a null ID when the placement pass was not added.
	  if (!addPass(&MachineBlockPlacementID))
	    return;

	  if (EnableBlockPlacementStats)
	    addPass(&MachineBlockPlacementStatsID);
	}

	//===---------------------------------------------------------------------===//
	/// GlobalISel Configuration
	//===---------------------------------------------------------------------===//

	/// Whether GlobalISel is enabled for this configuration. This implementation
	/// always answers false.
	bool TargetPassConfig::isGlobalISelEnabled() const {
	return false;
	}

	/// Whether a GlobalISel failure should abort rather than fall back.
	///
	/// An explicitly given EnableGlobalISelAbort option decides (abort iff its
	/// value is 1). When no abort behaviour is specified, we don't abort if the
	/// target says that GISel is enabled.
	bool TargetPassConfig::isGlobalISelAbortEnabled() const {
	  const bool ExplicitlySet = EnableGlobalISelAbort.getNumOccurrences() > 0;
	  if (!ExplicitlySet)
	    return !isGlobalISelEnabled();
	  return EnableGlobalISelAbort == 1;
	}

	/// Return true if the EnableGlobalISelAbort option has the value 2. The
	/// result is passed to createResetMachineFunctionPass() in
	/// addCoreISelPasses().
	bool TargetPassConfig::reportDiagnosticWhenGlobalISelFallback() const {
	return EnableGlobalISelAbort == 2;
	}
	Index: head/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp (revision 328816)
	+++ head/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp (revision 328817)
	@@ -1,113 +1,117 @@
	//===- TargetSubtargetInfo.cpp - General Target Information ----------------==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file This file describes the general parts of a Subtarget.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetSchedule.h"
	#include "llvm/MC/MCInst.h"
	#include "llvm/Support/Format.h"
	#include "llvm/Support/raw_ostream.h"
	#include <string>

	using namespace llvm;

	/// Construct the subtarget description. All arguments are forwarded unchanged
	/// to the MCSubtargetInfo base class; nothing else is initialized here.
	// NOTE(review): several parameters below (ProcSched, WPR, WL, RA, IS, OC)
	// look like they lost a '*' pointer declarator when this page was
	// extracted - confirm against the upstream file.
	TargetSubtargetInfo::TargetSubtargetInfo(
	const Triple &TT, StringRef CPU, StringRef FS,
	ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetFeatureKV> PD,
	const SubtargetInfoKV ProcSched, const MCWriteProcResEntry WPR,
	const MCWriteLatencyEntry WL, const MCReadAdvanceEntry RA,
	const InstrStage IS, const unsigned OC, const unsigned *FP)
	: MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched, WPR, WL, RA, IS, OC, FP) {
	}

	// Out-of-line defaulted destructor.
	TargetSubtargetInfo::~TargetSubtargetInfo() = default;

	/// This implementation always returns true.
	bool TargetSubtargetInfo::enableAtomicExpand() const {
	return true;
	}

	/// This implementation always returns false.
	+bool TargetSubtargetInfo::enableIndirectBrExpand() const {
	+ return false;
	+}
	+
	/// This implementation always returns false.
	bool TargetSubtargetInfo::enableMachineScheduler() const {
	return false;
	}

	/// This implementation follows enableMachineScheduler(): it returns whatever
	/// that function returns.
	bool TargetSubtargetInfo::enableJoinGlobalCopies() const {
	return enableMachineScheduler();
	}

	/// This implementation always returns true; \p OptLevel is not consulted.
	bool TargetSubtargetInfo::enableRALocalReassignment(
	CodeGenOpt::Level OptLevel) const {
	return true;
	}

	/// This implementation always returns false.
	bool TargetSubtargetInfo::enableAdvancedRASplitCost() const {
	return false;
	}

	/// This implementation returns the PostRAScheduler field of the subtarget's
	/// scheduling model.
	bool TargetSubtargetInfo::enablePostRAScheduler() const {
	return getSchedModel().PostRAScheduler;
	}

	/// This implementation always returns false.
	bool TargetSubtargetInfo::useAA() const {
	return false;
	}

	static std::string createSchedInfoStr(unsigned Latency,
	Optional<double> RThroughput) {
	static const char *SchedPrefix = " sched: [";
	std::string Comment;
	raw_string_ostream CS(Comment);
	if (Latency > 0 && RThroughput.hasValue())
	CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue())
	<< "]";
	else if (Latency > 0)
	CS << SchedPrefix << Latency << ":?]";
	else if (RThroughput.hasValue())
	CS << SchedPrefix << "?:" << RThroughput.getValue() << "]";
	CS.flush();
	return Comment;
	}

	/// Returns string representation of scheduler comment
	/// for the machine instruction \p MI. Pseudo and terminator instructions
	/// yield an empty string; otherwise latency and reciprocal throughput are
	/// computed from a freshly initialized TargetSchedModel and formatted by
	/// createSchedInfoStr().
	std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const {
	if (MI.isPseudo() \|\| MI.isTerminator())
	return std::string();
	// We don't cache TSchedModel because it depends on TargetInstrInfo
	// that could be changed during the compilation
	TargetSchedModel TSchedModel;
	TSchedModel.init(getSchedModel(), this, getInstrInfo());
	unsigned Latency = TSchedModel.computeInstrLatency(&MI);
	Optional<double> RThroughput = TSchedModel.computeInstrRThroughput(&MI);
	return createSchedInfoStr(Latency, RThroughput);
	}

	/// Returns string representation of scheduler comment
	/// for the MC instruction \p MCI. The latency comes from the instruction
	/// scheduling model if there is one, else from the instruction itineraries;
	/// if neither is available an empty string is returned.
	std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const {
	// We don't cache TSchedModel because it depends on TargetInstrInfo
	// that could be changed during the compilation
	TargetSchedModel TSchedModel;
	TSchedModel.init(getSchedModel(), this, getInstrInfo());
	// Assigned by exactly one of the two branches below; the third branch
	// returns before Latency is read.
	unsigned Latency;
	if (TSchedModel.hasInstrSchedModel())
	Latency = TSchedModel.computeInstrLatency(MCI.getOpcode());
	else if (TSchedModel.hasInstrItineraries()) {
	auto *ItinData = TSchedModel.getInstrItineraries();
	Latency = ItinData->getStageLatency(
	getInstrInfo()->get(MCI.getOpcode()).getSchedClass());
	} else
	return std::string();
	Optional<double> RThroughput =
	TSchedModel.computeInstrRThroughput(MCI.getOpcode());
	return createSchedInfoStr(Latency, RThroughput);
	}
	Index: head/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (revision 328817)
	@@ -1,4819 +1,4828 @@
	//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// \brief SI Implementation of TargetInstrInfo.
	//
	//===----------------------------------------------------------------------===//

	#include "SIInstrInfo.h"
	#include "AMDGPU.h"
	#include "AMDGPUSubtarget.h"
	#include "GCNHazardRecognizer.h"
	#include "SIDefines.h"
	#include "SIMachineFunctionInfo.h"
	#include "SIRegisterInfo.h"
	#include "Utils/AMDGPUBaseInfo.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/MemoryLocation.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineInstrBundle.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RegisterScavenging.h"
	#include "llvm/CodeGen/ScheduleDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetOpcodes.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetMachine.h"
	#include <cassert>
	#include <cstdint>
	#include <iterator>
	#include <utility>

	using namespace llvm;

	// Debug-only command line knob that restricts the range of branch
	// instructions.
	// Must be at least 4 to be able to branch over minimum unconditional branch
	// code. This is only for making it possible to write reasonably small tests for
	// long branches.
	// Registered as the hidden option "amdgpu-s-branch-bits"; defaults to 16.
	static cl::opt<unsigned>
	BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
	cl::desc("Restrict range of branch instructions (DEBUG)"));

	/// Builds the SI instruction info for subtarget \p ST. The base class and
	/// the register info member are both initialized from the same subtarget,
	/// and a reference to it is retained in the ST member.
	SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
	    : AMDGPUInstrInfo(ST),
	      RI(ST),
	      ST(ST) {
	}

	//===----------------------------------------------------------------------===//
	// TargetInstrInfo callbacks
	//===----------------------------------------------------------------------===//

	/// Returns the number of operands of \p Node once every trailing operand
	/// of type MVT::Glue has been dropped.
	static unsigned getNumOperandsNoGlue(SDNode *Node) {
	  unsigned Count = Node->getNumOperands();
	  // Walk backwards from the last operand until a non-glue operand is found.
	  for (; Count != 0; --Count) {
	    if (Node->getOperand(Count - 1).getValueType() != MVT::Glue)
	      break;
	  }
	  return Count;
	}

	/// Returns the last non-glue operand of \p Load, which is asserted to be
	/// the chain (value type MVT::Other).
	static SDValue findChainOperand(SDNode *Load) {
	  const unsigned ChainIdx = getNumOperandsNoGlue(Load) - 1;
	  SDValue Chain = Load->getOperand(ChainIdx);
	  assert(Chain.getValueType() == MVT::Other && "Chain missing from load node");
	  return Chain;
	}

	/// \brief Returns true if both nodes have the same value for the given
	/// operand \p Op, or if both nodes do not have this operand.
	static bool nodesHaveSameOperandValue(SDNode N0, SDNode N1, unsigned OpName) {
	unsigned Opc0 = N0->getMachineOpcode();
	unsigned Opc1 = N1->getMachineOpcode();

	int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
	int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

	if (Op0Idx == -1 && Op1Idx == -1)
	return true;


	if ((Op0Idx == -1 && Op1Idx != -1) \|\|
	(Op1Idx == -1 && Op0Idx != -1))
	return false;

	// getNamedOperandIdx returns the index for the MachineInstr's operands,
	// which includes the result as the first operand. We are indexing into the
	// MachineSDNode's operands, so we need to skip the result operand to get
	// the real index.
	--Op0Idx;
	--Op1Idx;

	return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
	}

	/// Reports whether \p MI is one of the move opcodes this target treats as
	/// trivially rematerializable (V_MOV_B32_e32, V_MOV_B32_e64 and
	/// V_MOV_B64_PSEUDO). \p AA is not consulted.
	bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
	                                                    AliasAnalysis *AA) const {
	  // TODO: The generic check fails for VALU instructions that should be
	  // rematerializable due to implicit reads of exec. We really want all of the
	  // generic logic for this except for this.
	  const unsigned Opc = MI.getOpcode();
	  return Opc == AMDGPU::V_MOV_B32_e32 ||
	         Opc == AMDGPU::V_MOV_B32_e64 ||
	         Opc == AMDGPU::V_MOV_B64_PSEUDO;
	}

	bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode Load0, SDNode Load1,
	int64_t &Offset0,
	int64_t &Offset1) const {
	if (!Load0->isMachineOpcode() \|\| !Load1->isMachineOpcode())
	return false;

	unsigned Opc0 = Load0->getMachineOpcode();
	unsigned Opc1 = Load1->getMachineOpcode();

	// Make sure both are actually loads.
	if (!get(Opc0).mayLoad() \|\| !get(Opc1).mayLoad())
	return false;

	if (isDS(Opc0) && isDS(Opc1)) {

	// FIXME: Handle this case:
	if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
	return false;

	// Check base reg.
	if (Load0->getOperand(1) != Load1->getOperand(1))
	return false;

	// Check chain.
	if (findChainOperand(Load0) != findChainOperand(Load1))
	return false;

	// Skip read2 / write2 variants for simplicity.
	// TODO: We should report true if the used offsets are adjacent (excluded
	// st64 versions).
	if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 \|\|
	AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
	return false;

	Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
	Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
	return true;
	}

	if (isSMRD(Opc0) && isSMRD(Opc1)) {
	// Skip time and cache invalidation instructions.
	if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 \|\|
	AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
	return false;

	assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

	// Check base reg.
	if (Load0->getOperand(0) != Load1->getOperand(0))
	return false;

	const ConstantSDNode *Load0Offset =
	dyn_cast<ConstantSDNode>(Load0->getOperand(1));
	const ConstantSDNode *Load1Offset =
	dyn_cast<ConstantSDNode>(Load1->getOperand(1));

	if (!Load0Offset \|\| !Load1Offset)
	return false;

	// Check chain.
	if (findChainOperand(Load0) != findChainOperand(Load1))
	return false;

	Offset0 = Load0Offset->getZExtValue();
	Offset1 = Load1Offset->getZExtValue();
	return true;
	}

	// MUBUF and MTBUF can access the same addresses.
	if ((isMUBUF(Opc0) \|\| isMTBUF(Opc0)) && (isMUBUF(Opc1) \|\| isMTBUF(Opc1))) {

	// MUBUF and MTBUF have vaddr at different indices.
	if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) \|\|
	findChainOperand(Load0) != findChainOperand(Load1) \|\|
	!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) \|\|
	!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
	return false;

	int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
	int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

	if (OffIdx0 == -1 \|\| OffIdx1 == -1)
	return false;

	// getNamedOperandIdx returns the index for MachineInstrs. Since they
	// inlcude the output in the operand list, but SDNodes don't, we need to
	// subtract the index by one.
	--OffIdx0;
	--OffIdx1;

	SDValue Off0 = Load0->getOperand(OffIdx0);
	SDValue Off1 = Load1->getOperand(OffIdx1);

	// The offset might be a FrameIndexSDNode.
	if (!isa<ConstantSDNode>(Off0) \|\| !isa<ConstantSDNode>(Off1))
	return false;

	Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
	Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
	return true;
	}

	return false;
	}

	/// Returns true when \p Opc is one of the DS read2/write2 "ST64" opcodes
	/// (32-bit or 64-bit element variants).
	static bool isStride64(unsigned Opc) {
	  return Opc == AMDGPU::DS_READ2ST64_B32 ||
	         Opc == AMDGPU::DS_READ2ST64_B64 ||
	         Opc == AMDGPU::DS_WRITE2ST64_B32 ||
	         Opc == AMDGPU::DS_WRITE2ST64_B64;
	}

	/// Decomposes the address of the memory instruction \p LdSt into a base
	/// register plus an immediate offset.
	///
	/// Handles DS, MUBUF/MTBUF, SMRD and FLAT instructions; any other
	/// instruction yields false.
	///
	/// \param LdSt    The load/store instruction to analyze.
	/// \param BaseReg [out] Register holding the base address.
	/// \param Offset  [out] Immediate offset from \p BaseReg.
	/// \param TRI     Register info used to query register sizes for the DS
	///                two-offset forms.
	/// \returns true if \p BaseReg and \p Offset were set.
	bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
	int64_t &Offset,
	const TargetRegisterInfo *TRI) const {
	unsigned Opc = LdSt.getOpcode();

	if (isDS(LdSt)) {
	const MachineOperand *OffsetImm =
	getNamedOperand(LdSt, AMDGPU::OpName::offset);
	if (OffsetImm) {
	// Normal, single offset LDS instruction.
	const MachineOperand *AddrReg =
	getNamedOperand(LdSt, AMDGPU::OpName::addr);

	BaseReg = AddrReg->getReg();
	Offset = OffsetImm->getImm();
	return true;
	}

	// The 2 offset instructions use offset0 and offset1 instead. We can treat
	// these as a load with a single offset if the 2 offsets are consecutive. We
	// will use this for some partially aligned loads.
	const MachineOperand *Offset0Imm =
	getNamedOperand(LdSt, AMDGPU::OpName::offset0);
	const MachineOperand *Offset1Imm =
	getNamedOperand(LdSt, AMDGPU::OpName::offset1);

	// The immediates are truncated to 8 bits here.
	uint8_t Offset0 = Offset0Imm->getImm();
	uint8_t Offset1 = Offset1Imm->getImm();

	// Only accept the pair when offset1 immediately follows offset0.
	if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
	// Each of these offsets is in element sized units, so we need to convert
	// to bytes of the individual reads.

	unsigned EltSize;
	if (LdSt.mayLoad())
	// NOTE(review): divides the register size in bits by 16 rather than 8 --
	// presumably because operand 0 holds both loaded elements; confirm.
	EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
	else {
	assert(LdSt.mayStore());
	// For stores the element size is taken from the data0 operand alone.
	int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
	EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
	}

	// The ST64 variants scale the offsets by a further factor of 64.
	if (isStride64(Opc))
	EltSize *= 64;

	const MachineOperand *AddrReg =
	getNamedOperand(LdSt, AMDGPU::OpName::addr);
	BaseReg = AddrReg->getReg();
	Offset = EltSize * Offset0;
	return true;
	}

	return false;
	}

	if (isMUBUF(LdSt) \|\| isMTBUF(LdSt)) {
	// A register soffset cannot be folded into an immediate offset.
	const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
	if (SOffset && SOffset->isReg())
	return false;

	// Without a vaddr operand there is no base register to report.
	const MachineOperand *AddrReg =
	getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
	if (!AddrReg)
	return false;

	const MachineOperand *OffsetImm =
	getNamedOperand(LdSt, AMDGPU::OpName::offset);
	BaseReg = AddrReg->getReg();
	Offset = OffsetImm->getImm();

	if (SOffset) // soffset can be an inline immediate.
	Offset += SOffset->getImm();

	return true;
	}

	if (isSMRD(LdSt)) {
	// Forms without an immediate offset operand are not analyzable here.
	const MachineOperand *OffsetImm =
	getNamedOperand(LdSt, AMDGPU::OpName::offset);
	if (!OffsetImm)
	return false;

	const MachineOperand *SBaseReg =
	getNamedOperand(LdSt, AMDGPU::OpName::sbase);
	BaseReg = SBaseReg->getReg();
	Offset = OffsetImm->getImm();
	return true;
	}

	if (isFLAT(LdSt)) {
	const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
	if (VAddr) {
	// Can't analyze 2 offsets.
	if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
	return false;

	BaseReg = VAddr->getReg();
	} else {
	// scratch instructions have either vaddr or saddr.
	BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
	}

	Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
	return true;
	}

	return false;
	}

	/// Returns true if the two memory instructions are known to address memory
	/// through the same base pointer.
	///
	/// The fast path is identical base registers. Otherwise, when each
	/// instruction has exactly one memory operand in the same address space,
	/// the IR values attached to those memory operands are traced back to their
	/// underlying objects and compared.
	///
	/// \param MI1      First memory instruction.
	/// \param BaseReg1 Base register of \p MI1.
	/// \param MI2      Second memory instruction.
	/// \param BaseReg2 Base register of \p MI2.
	static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
	                                  const MachineInstr &MI2, unsigned BaseReg2) {
	  if (BaseReg1 == BaseReg2)
	    return true;

	  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
	    return false;

	  auto MO1 = *MI1.memoperands_begin();
	  auto MO2 = *MI2.memoperands_begin();
	  if (MO1->getAddrSpace() != MO2->getAddrSpace())
	    return false;

	  auto Base1 = MO1->getValue();
	  auto Base2 = MO2->getValue();
	  if (!Base1 || !Base2)
	    return false;
	  const MachineFunction &MF = *MI1.getParent()->getParent();
	  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
	  // Each base must be resolved from its own value. Resolving Base2 from
	  // Base1 would make the comparison below trivially true.
	  Base1 = GetUnderlyingObject(Base1, DL);
	  Base2 = GetUnderlyingObject(Base2, DL);

	  // Undef carries no identity, so two undef bases are not "the same".
	  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
	    return false;

	  return Base1 == Base2;
	}

	/// Decides whether two memory operations should be clustered together.
	///
	/// Clustering is refused unless both instructions share a base pointer,
	/// belong to the same supported family (MUBUF, MTBUF, FLAT, SMRD or DS) and
	/// both expose a destination operand. Buffer and flat accesses are further
	/// capped at six loads per cluster. Finally the estimated number of bytes
	/// loaded by the cluster must not exceed a fixed threshold.
	///
	/// \param FirstLdSt  First memory instruction.
	/// \param BaseReg1   Base register of \p FirstLdSt.
	/// \param SecondLdSt Second memory instruction.
	/// \param BaseReg2   Base register of \p SecondLdSt.
	/// \param NumLoads   Number of loads the cluster would contain.
	bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
	                                      unsigned BaseReg1,
	                                      MachineInstr &SecondLdSt,
	                                      unsigned BaseReg2,
	                                      unsigned NumLoads) const {
	  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
	    return false;

	  const MachineOperand *DstA = nullptr;
	  const MachineOperand *DstB = nullptr;

	  const bool BothBufferOrFlat =
	      (isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
	      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
	      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt));

	  if (BothBufferOrFlat) {
	    const unsigned MaxGlobalLoadCluster = 6;
	    if (NumLoads > MaxGlobalLoadCluster)
	      return false;

	    // Prefer vdata; fall back to vdst when the instruction has no vdata.
	    DstA = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
	    if (DstA == nullptr)
	      DstA = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);

	    DstB = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
	    if (DstB == nullptr)
	      DstB = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
	  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
	    DstA = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
	    DstB = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
	  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
	    DstA = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
	    DstB = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
	  }

	  if (DstA == nullptr || DstB == nullptr)
	    return false;

	  // Try to limit clustering based on the total number of bytes loaded
	  // rather than the number of instructions. This is done to help reduce
	  // register pressure. The method used is somewhat inexact, though,
	  // because it assumes that all loads in the cluster will load the
	  // same number of bytes as FirstLdSt.

	  // The unit of this value is bytes.
	  // FIXME: This needs finer tuning.
	  const unsigned LoadClusterThreshold = 16;

	  const MachineFunction &MF = *FirstLdSt.getParent()->getParent();
	  const TargetRegisterClass *DstRC =
	      MF.getRegInfo().getRegClass(DstA->getReg());
	  const unsigned BytesPerLoad = RI.getRegSizeInBits(*DstRC) / 8;

	  return NumLoads * BytesPerLoad <= LoadClusterThreshold;
	}

	static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	const DebugLoc &DL, unsigned DestReg,
	unsigned SrcReg, bool KillSrc) {
	MachineFunction *MF = MBB.getParent();
	DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
	"illegal SGPR to VGPR copy",
	DL, DS_Error);
	LLVMContext &C = MF->getFunction().getContext();
	C.diagnose(IllegalCopy);

	BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
	.addReg(SrcReg, getKillRegState(KillSrc));
	}

	/// Emits the instruction(s) that copy physical register \p SrcReg into
	/// physical register \p DestReg, inserted before \p MI in \p MBB.
	///
	/// The sequence is chosen from the register class of \p DestReg. Copies
	/// into a scalar class from a source outside the matching scalar class are
	/// reported through reportIllegalCopy instead of being emitted.
	///
	/// \param DL      Debug location attached to every emitted instruction.
	/// \param KillSrc Whether the copy is the last use of \p SrcReg.
	void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	const DebugLoc &DL, unsigned DestReg,
	unsigned SrcReg, bool KillSrc) const {
	const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);

	// 32-bit VGPR destination: a single V_MOV_B32 from a VGPR or 32-bit SGPR.
	if (RC == &AMDGPU::VGPR_32RegClass) {
	assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) \|\|
	AMDGPU::SReg_32RegClass.contains(SrcReg));
	BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
	.addReg(SrcReg, getKillRegState(KillSrc));
	return;
	}

	// 32-bit scalar destination.
	if (RC == &AMDGPU::SReg_32_XM0RegClass \|\|
	RC == &AMDGPU::SReg_32RegClass) {
	// Copying from SCC is done with S_CSELECT_B32 over the immediates -1 and 0.
	if (SrcReg == AMDGPU::SCC) {
	BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
	.addImm(-1)
	.addImm(0);
	return;
	}

	if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
	reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
	return;
	}

	BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
	.addReg(SrcReg, getKillRegState(KillSrc));
	return;
	}

	// 64-bit scalar destination.
	if (RC == &AMDGPU::SReg_64RegClass) {
	// VCC additionally accepts a 32-bit VGPR source via a compare against 0.
	if (DestReg == AMDGPU::VCC) {
	if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
	BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
	.addReg(SrcReg, getKillRegState(KillSrc));
	} else {
	// FIXME: Hack until VReg_1 removed.
	assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
	BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
	.addImm(0)
	.addReg(SrcReg, getKillRegState(KillSrc));
	}

	return;
	}

	if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
	reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
	return;
	}

	BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
	.addReg(SrcReg, getKillRegState(KillSrc));
	return;
	}

	// Copying into SCC is done by comparing the 32-bit SGPR source against 0.
	if (DestReg == AMDGPU::SCC) {
	assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
	BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
	.addReg(SrcReg, getKillRegState(KillSrc))
	.addImm(0);
	return;
	}

	// Remaining classes are copied piecewise, one sub-register per move.
	// Default to 4-byte VALU moves; SGPR classes use scalar moves instead
	// (8-byte pieces when the class is wider than 32 bits).
	unsigned EltSize = 4;
	unsigned Opcode = AMDGPU::V_MOV_B32_e32;
	if (RI.isSGPRClass(RC)) {
	if (RI.getRegSizeInBits(*RC) > 32) {
	Opcode = AMDGPU::S_MOV_B64;
	EltSize = 8;
	} else {
	Opcode = AMDGPU::S_MOV_B32;
	EltSize = 4;
	}

	if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
	reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
	return;
	}
	}

	ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
	// Walk the pieces low-to-high when the destination's hardware index is not
	// above the source's, otherwise high-to-low.
	// NOTE(review): presumably this avoids overwriting not-yet-copied source
	// pieces when the two register ranges overlap -- confirm.
	bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);

	for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
	unsigned SubIdx;
	if (Forward)
	SubIdx = SubIndices[Idx];
	else
	SubIdx = SubIndices[SubIndices.size() - Idx - 1];

	MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
	get(Opcode), RI.getSubReg(DestReg, SubIdx));

	Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

	// The first emitted move carries an implicit def of the whole destination.
	if (Idx == 0)
	Builder.addReg(DestReg, RegState::Define \| RegState::Implicit);

	// Every move implicitly uses the whole source; only the last one may
	// carry the kill flag.
	bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
	Builder.addReg(SrcReg, getKillRegState(UseKill) \| RegState::Implicit);
	}
	}

	/// Maps \p Opcode to the opcode to use after commuting its operands.
	///
	/// The "REV" mapping is tried first and the reverse ("orig") mapping
	/// second. If either mapping applies, the mapped opcode is returned only
	/// when it exists on the target (pseudoToMCOpcode does not return -1);
	/// otherwise -1 is returned. When neither mapping applies, \p Opcode itself
	/// is returned.
	int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
	  // Try to map original to commuted opcode, then commuted to original.
	  int Mapped = AMDGPU::getCommuteRev(Opcode);
	  if (Mapped == -1)
	    Mapped = AMDGPU::getCommuteOrig(Opcode);

	  // No commuted counterpart is recorded: the opcode commutes to itself.
	  if (Mapped == -1)
	    return Opcode;

	  // Check if the mapped opcode exists on the target.
	  if (pseudoToMCOpcode(Mapped) == -1)
	    return -1;
	  return Mapped;
	}

	/// Emits the move instruction(s) that load the immediate \p Value into
	/// \p DestReg, inserted before \p MI in \p MBB.
	///
	/// The 32- and 64-bit scalar and vector classes are handled with a single
	/// move. Any other class is split into sub-registers: the first piece
	/// receives \p Value and every remaining piece receives 0.
	///
	/// \param DL      Debug location attached to every emitted instruction.
	/// \param DestReg Destination register; its class is looked up in the
	///                function's MachineRegisterInfo.
	/// \param Value   Immediate to materialize.
	void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
	                                       MachineBasicBlock::iterator MI,
	                                       const DebugLoc &DL, unsigned DestReg,
	                                       int64_t Value) const {
	  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
	  if (RegClass == &AMDGPU::SReg_32RegClass ||
	      RegClass == &AMDGPU::SGPR_32RegClass ||
	      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
	      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
	    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
	        .addImm(Value);
	    return;
	  }

	  if (RegClass == &AMDGPU::SReg_64RegClass ||
	      RegClass == &AMDGPU::SGPR_64RegClass ||
	      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
	    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
	        .addImm(Value);
	    return;
	  }

	  if (RegClass == &AMDGPU::VGPR_32RegClass) {
	    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
	        .addImm(Value);
	    return;
	  }
	  if (RegClass == &AMDGPU::VReg_64RegClass) {
	    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
	        .addImm(Value);
	    return;
	  }

	  // Wider classes: default to 4-byte VALU moves; SGPR classes use scalar
	  // moves (8-byte pieces when the class is wider than 32 bits).
	  unsigned EltSize = 4;
	  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
	  if (RI.isSGPRClass(RegClass)) {
	    if (RI.getRegSizeInBits(*RegClass) > 32) {
	      Opcode = AMDGPU::S_MOV_B64;
	      EltSize = 8;
	    } else {
	      Opcode = AMDGPU::S_MOV_B32;
	      EltSize = 4;
	    }
	  }

	  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
	  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
	    // Only the first piece carries the value; the rest are zeroed.
	    int64_t IdxValue = Idx == 0 ? Value : 0;

	    // Address the piece through its sub-register index from SubIndices, not
	    // through the loop counter (which is not a sub-register index).
	    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
	        get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
	    Builder.addImm(IdxValue);
	  }
	}

	/// Returns the register class preferred for select results. The 32-bit
	/// VGPR class is returned regardless of \p Size.
	const TargetRegisterClass *
	SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
	  const TargetRegisterClass *Preferred = &AMDGPU::VGPR_32RegClass;
	  return Preferred;
	}

	/// Emits a 32-bit vector select into \p DstReg, inserted before \p I in
	/// \p MBB, choosing between \p TrueReg and \p FalseReg according to \p Cond.
	///
	/// Every path ends in a V_CNDMASK_B32_e64 whose mask is a freshly created
	/// SReg_64_XEXEC virtual register; the paths differ in how that mask is
	/// produced.
	///
	/// \param DstReg Destination; asserted to be in the VGPR_32 class.
	/// \param Cond   Either a single operand that is copied into the mask, or a
	///               pair {predicate immediate, operand}. Any other size is
	///               unreachable.
	void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator I,
	const DebugLoc &DL, unsigned DstReg,
	ArrayRef<MachineOperand> Cond,
	unsigned TrueReg,
	unsigned FalseReg) const {
	MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
	"Not a VGPR32 reg");

	if (Cond.size() == 1) {
	// Single operand: copy it into the mask register and use it directly.
	unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
	.add(Cond[0]);
	BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
	.addReg(FalseReg)
	.addReg(TrueReg)
	.addReg(SReg);
	} else if (Cond.size() == 2) {
	assert(Cond[0].isImm() && "Cond[0] is not an immediate");
	switch (Cond[0].getImm()) {
	case SIInstrInfo::SCC_TRUE: {
	// Build the mask with S_CSELECT_B64 over the immediates -1 and 0.
	unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
	.addImm(-1)
	.addImm(0);
	BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
	.addReg(FalseReg)
	.addReg(TrueReg)
	.addReg(SReg);
	break;
	}
	case SIInstrInfo::SCC_FALSE: {
	// Same as SCC_TRUE with the S_CSELECT_B64 immediates swapped.
	unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
	.addImm(0)
	.addImm(-1);
	BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
	.addReg(FalseReg)
	.addReg(TrueReg)
	.addReg(SReg);
	break;
	}
	case SIInstrInfo::VCCNZ: {
	// Copy the condition register (with its implicit flag cleared) into the
	// mask.
	MachineOperand RegOp = Cond[1];
	RegOp.setImplicit(false);
	unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
	.add(RegOp);
	BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
	.addReg(FalseReg)
	.addReg(TrueReg)
	.addReg(SReg);
	break;
	}
	case SIInstrInfo::VCCZ: {
	// Same mask as VCCNZ, but TrueReg and FalseReg are swapped on the
	// V_CNDMASK.
	MachineOperand RegOp = Cond[1];
	RegOp.setImplicit(false);
	unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
	.add(RegOp);
	BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
	.addReg(TrueReg)
	.addReg(FalseReg)
	.addReg(SReg);
	break;
	}
	case SIInstrInfo::EXECNZ: {
	// S_OR_SAVEEXEC_B64 with immediate 0 is emitted first, then the mask is
	// built with S_CSELECT_B64.
	// NOTE(review): presumably the S_OR_SAVEEXEC sets SCC from exec -- confirm.
	unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
	.addImm(0);
	BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
	.addImm(-1)
	.addImm(0);
	BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
	.addReg(FalseReg)
	.addReg(TrueReg)
	.addReg(SReg);
	break;
	}
	case SIInstrInfo::EXECZ: {
	// Mirrors EXECNZ with the S_CSELECT_B64 immediates swapped, but this
	// predicate is marked unhandled: llvm_unreachable fires after the
	// instructions have been emitted.
	unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
	.addImm(0);
	BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
	.addImm(0)
	.addImm(-1);
	BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
	.addReg(FalseReg)
	.addReg(TrueReg)
	.addReg(SReg);
	llvm_unreachable("Unhandled branch predicate EXECZ");
	break;
	}
	default:
	llvm_unreachable("invalid branch predicate");
	}
	} else {
	llvm_unreachable("Can only handle Cond size 1 or 2");
	}
	}

	/// Inserts a V_CMP_EQ_I32_e64 before \p I that compares the immediate
	/// \p Value with \p SrcReg.
	///
	/// \returns the newly created SReg_64 virtual register that receives the
	///          comparison result.
	unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
	                               MachineBasicBlock::iterator I,
	                               const DebugLoc &DL,
	                               unsigned SrcReg, int Value) const {
	  MachineFunction &MF = *MBB->getParent();
	  const unsigned CmpResult =
	      MF.getRegInfo().createVirtualRegister(&AMDGPU::SReg_64RegClass);

	  MachineInstrBuilder Cmp =
	      BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), CmpResult);
	  Cmp.addImm(Value);
	  Cmp.addReg(SrcReg);

	  return CmpResult;
	}

	/// Inserts a V_CMP_NE_I32_e64 before \p I that compares the immediate
	/// \p Value with \p SrcReg.
	///
	/// \returns the newly created SReg_64 virtual register that receives the
	///          comparison result.
	unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
	                               MachineBasicBlock::iterator I,
	                               const DebugLoc &DL,
	                               unsigned SrcReg, int Value) const {
	  MachineFunction &MF = *MBB->getParent();
	  const unsigned CmpResult =
	      MF.getRegInfo().createVirtualRegister(&AMDGPU::SReg_64RegClass);

	  MachineInstrBuilder Cmp =
	      BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), CmpResult);
	  Cmp.addImm(Value);
	  Cmp.addReg(SrcReg);

	  return CmpResult;
	}

	/// Selects the move opcode for a destination of class \p DstRC.
	///
	/// 32-bit classes map to S_MOV_B32 or V_MOV_B32_e32, 64-bit classes to
	/// S_MOV_B64 or V_MOV_B64_PSEUDO, depending on whether the class is an SGPR
	/// class. Any other size falls back to COPY.
	unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
	  const unsigned SizeInBits = RI.getRegSizeInBits(*DstRC);

	  if (SizeInBits == 32)
	    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

	  if (SizeInBits == 64)
	    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B64
	                                 : AMDGPU::V_MOV_B64_PSEUDO;

	  return AMDGPU::COPY;
	}

	/// Maps an SGPR spill size in bytes (4, 8, 16, 32 or 64) to the matching
	/// SI_SPILL_S*_SAVE pseudo opcode. Any other size is unreachable.
	static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
	  if (Size == 4)
	    return AMDGPU::SI_SPILL_S32_SAVE;
	  if (Size == 8)
	    return AMDGPU::SI_SPILL_S64_SAVE;
	  if (Size == 16)
	    return AMDGPU::SI_SPILL_S128_SAVE;
	  if (Size == 32)
	    return AMDGPU::SI_SPILL_S256_SAVE;
	  if (Size == 64)
	    return AMDGPU::SI_SPILL_S512_SAVE;

	  llvm_unreachable("unknown register size");
	}

	/// Maps a VGPR spill size in bytes (4, 8, 12, 16, 32 or 64) to the matching
	/// SI_SPILL_V*_SAVE pseudo opcode. Any other size is unreachable.
	static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
	  if (Size == 4)
	    return AMDGPU::SI_SPILL_V32_SAVE;
	  if (Size == 8)
	    return AMDGPU::SI_SPILL_V64_SAVE;
	  if (Size == 12)
	    return AMDGPU::SI_SPILL_V96_SAVE;
	  if (Size == 16)
	    return AMDGPU::SI_SPILL_V128_SAVE;
	  if (Size == 32)
	    return AMDGPU::SI_SPILL_V256_SAVE;
	  if (Size == 64)
	    return AMDGPU::SI_SPILL_V512_SAVE;

	  llvm_unreachable("unknown register size");
	}

	/// Emits a spill of \p SrcReg to stack slot \p FrameIndex, inserted before
	/// \p MI in \p MBB.
	///
	/// SGPR classes are spilled with an SI_SPILL_S*_SAVE pseudo and the frame
	/// index is tagged with stack ID 1. VGPR classes are spilled with an
	/// SI_SPILL_V*_SAVE pseudo. If VGPR spilling is not enabled for the
	/// function, an error is emitted on the LLVMContext and a KILL of
	/// \p SrcReg is inserted instead.
	///
	/// \param isKill Whether the spill is the last use of \p SrcReg.
	/// \param RC     Register class of \p SrcReg; selects SGPR vs VGPR handling.
	/// \param TRI    Register info used to query the spill size of \p RC.
	void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	unsigned SrcReg, bool isKill,
	int FrameIndex,
	const TargetRegisterClass *RC,
	const TargetRegisterInfo *TRI) const {
	MachineFunction *MF = MBB.getParent();
	SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
	MachineFrameInfo &FrameInfo = MF->getFrameInfo();
	DebugLoc DL = MBB.findDebugLoc(MI);

	// The registers used to address the stack must never be spilled themselves.
	assert(SrcReg != MFI->getStackPtrOffsetReg() &&
	SrcReg != MFI->getFrameOffsetReg() &&
	SrcReg != MFI->getScratchWaveOffsetReg());

	// Describe the store to the fixed stack slot with a memory operand.
	unsigned Size = FrameInfo.getObjectSize(FrameIndex);
	unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
	MachinePointerInfo PtrInfo
	= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
	MachineMemOperand *MMO
	= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
	Size, Align);
	unsigned SpillSize = TRI->getSpillSize(*RC);

	if (RI.isSGPRClass(RC)) {
	MFI->setHasSpilledSGPRs();

	// We are only allowed to create one new instruction when spilling
	// registers, so we need to use pseudo instruction for spilling SGPRs.
	const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

	// The SGPR spill/restore instructions only work on number sgprs, so we need
	// to make sure we are using the correct register class.
	if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
	MachineRegisterInfo &MRI = MF->getRegInfo();
	MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
	}

	MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
	.addReg(SrcReg, getKillRegState(isKill)) // data
	.addFrameIndex(FrameIndex) // addr
	.addMemOperand(MMO)
	.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
	.addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
	// Add the scratch resource registers as implicit uses because we may end up
	// needing them, and need to ensure that the reserved registers are
	// correctly handled.

	// Tag the slot with stack ID 1.
	// NOTE(review): presumably ID 1 marks SGPR spill slots -- confirm.
	FrameInfo.setStackID(FrameIndex, 1);
	if (ST.hasScalarStores()) {
	// m0 is used for offset to scalar stores if used to spill.
	Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine \| RegState::Dead);
	}

	return;
	}

	// VGPR path: report an error and emit a KILL when VGPR spilling is not
	// enabled for this function.
	if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
	LLVMContext &Ctx = MF->getFunction().getContext();
	Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
	" spill register");
	BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
	.addReg(SrcReg);

	return;
	}

	assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

	unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
	MFI->setHasSpilledVGPRs();
	BuildMI(MBB, MI, DL, get(Opcode))
	.addReg(SrcReg, getKillRegState(isKill)) // data
	.addFrameIndex(FrameIndex) // addr
	.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
	.addReg(MFI->getFrameOffsetReg()) // scratch_offset
	.addImm(0) // offset
	.addMemOperand(MMO);
	}

	/// Maps an SGPR spill size in bytes (4, 8, 16, 32 or 64) to the matching
	/// SI_SPILL_S*_RESTORE pseudo opcode. Any other size is unreachable.
	static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
	  if (Size == 4)
	    return AMDGPU::SI_SPILL_S32_RESTORE;
	  if (Size == 8)
	    return AMDGPU::SI_SPILL_S64_RESTORE;
	  if (Size == 16)
	    return AMDGPU::SI_SPILL_S128_RESTORE;
	  if (Size == 32)
	    return AMDGPU::SI_SPILL_S256_RESTORE;
	  if (Size == 64)
	    return AMDGPU::SI_SPILL_S512_RESTORE;

	  llvm_unreachable("unknown register size");
	}

	/// Maps a VGPR spill size in bytes (4, 8, 12, 16, 32 or 64) to the matching
	/// SI_SPILL_V*_RESTORE pseudo opcode. Any other size is unreachable.
	static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
	  if (Size == 4)
	    return AMDGPU::SI_SPILL_V32_RESTORE;
	  if (Size == 8)
	    return AMDGPU::SI_SPILL_V64_RESTORE;
	  if (Size == 12)
	    return AMDGPU::SI_SPILL_V96_RESTORE;
	  if (Size == 16)
	    return AMDGPU::SI_SPILL_V128_RESTORE;
	  if (Size == 32)
	    return AMDGPU::SI_SPILL_V256_RESTORE;
	  if (Size == 64)
	    return AMDGPU::SI_SPILL_V512_RESTORE;

	  llvm_unreachable("unknown register size");
	}

	/// Emits a reload of \p DestReg from stack slot \p FrameIndex, inserted
	/// before \p MI in \p MBB.
	///
	/// SGPR classes are restored with an SI_SPILL_S*_RESTORE pseudo and the
	/// frame index is tagged with stack ID 1. VGPR classes are restored with an
	/// SI_SPILL_V*_RESTORE pseudo. If VGPR spilling is not enabled for the
	/// function, an error is emitted on the LLVMContext and \p DestReg is
	/// defined with IMPLICIT_DEF instead.
	///
	/// \param RC  Register class of \p DestReg; selects SGPR vs VGPR handling.
	/// \param TRI Register info used to query the spill size of \p RC.
	void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	unsigned DestReg, int FrameIndex,
	const TargetRegisterClass *RC,
	const TargetRegisterInfo *TRI) const {
	MachineFunction *MF = MBB.getParent();
	const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
	MachineFrameInfo &FrameInfo = MF->getFrameInfo();
	DebugLoc DL = MBB.findDebugLoc(MI);
	unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
	unsigned Size = FrameInfo.getObjectSize(FrameIndex);
	unsigned SpillSize = TRI->getSpillSize(*RC);

	// Describe the load from the fixed stack slot with a memory operand.
	MachinePointerInfo PtrInfo
	= MachinePointerInfo::getFixedStack(*MF, FrameIndex);

	MachineMemOperand *MMO = MF->getMachineMemOperand(
	PtrInfo, MachineMemOperand::MOLoad, Size, Align);

	if (RI.isSGPRClass(RC)) {
	// FIXME: Maybe this should not include a memoperand because it will be
	// lowered to non-memory instructions.
	const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
	// Virtual 4-byte destinations are constrained to SReg_32_XM0, mirroring
	// the constraint applied on the spill side.
	if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
	MachineRegisterInfo &MRI = MF->getRegInfo();
	MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
	}

	// Tag the slot with stack ID 1.
	// NOTE(review): presumably ID 1 marks SGPR spill slots -- confirm.
	FrameInfo.setStackID(FrameIndex, 1);
	MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
	.addFrameIndex(FrameIndex) // addr
	.addMemOperand(MMO)
	.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
	.addReg(MFI->getFrameOffsetReg(), RegState::Implicit);

	if (ST.hasScalarStores()) {
	// m0 is used for offset to scalar stores if used to spill.
	Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine \| RegState::Dead);
	}

	return;
	}

	// VGPR path: report an error and emit an IMPLICIT_DEF when VGPR spilling
	// is not enabled for this function.
	if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
	LLVMContext &Ctx = MF->getFunction().getContext();
	Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
	" restore register");
	BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

	return;
	}

	assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

	unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
	BuildMI(MBB, MI, DL, get(Opcode), DestReg)
	.addFrameIndex(FrameIndex) // vaddr
	.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
	.addReg(MFI->getFrameOffsetReg()) // scratch_offset
	.addImm(0) // offset
	.addMemOperand(MMO);
	}

	/// Emits code that computes the LDS address used to spill a frame index
	/// and leaves it in \p TmpReg.
	///
	/// On first use within a function, a per-thread ID register is computed in
	/// the entry block, scaled by 4 (shift left by 2) and cached in the
	/// function info; later calls reuse it.
	///
	/// \param MBB         Block receiving the final address computation.
	/// \param MI          Instruction before which that computation is inserted.
	/// \param RS          Scavenger used to obtain temporary SGPRs in the entry
	///                    block.
	/// \param TmpReg      Register that receives the computed address.
	/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
	/// \param Size        Not referenced by this function.
	/// \returns \p TmpReg, or AMDGPU::NoRegister if no unused VGPR could be
	///          found to hold the thread ID.
	unsigned SIInstrInfo::calculateLDSSpillAddress(
	MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
	unsigned FrameOffset, unsigned Size) const {
	MachineFunction *MF = MBB.getParent();
	SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
	const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
	DebugLoc DL = MBB.findDebugLoc(MI);
	unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
	unsigned WavefrontSize = ST.getWavefrontSize();

	unsigned TIDReg = MFI->getTIDReg();
	if (!MFI->hasCalculatedTID()) {
	// The thread ID is computed once, at the top of the entry block. Note that
	// this DL shadows the outer one for the instructions emitted there.
	MachineBasicBlock &Entry = MBB.getParent()->front();
	MachineBasicBlock::iterator Insert = Entry.front();
	DebugLoc DL = Insert->getDebugLoc();

	TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
	*MF);
	if (TIDReg == AMDGPU::NoRegister)
	return TIDReg;

	if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
	WorkGroupSize > WavefrontSize) {
	// NOTE(review): these are named TIDIG* but are read from the preloaded
	// WORKGROUP_ID_* registers -- confirm which is intended.
	unsigned TIDIGXReg
	= MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
	unsigned TIDIGYReg
	= MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
	unsigned TIDIGZReg
	= MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
	unsigned InputPtrReg =
	MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
	// Make sure the ID registers are live into the entry block.
	for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
	if (!Entry.isLiveIn(Reg))
	Entry.addLiveIn(Reg);
	}

	RS->enterBasicBlock(Entry);
	// FIXME: Can we scavenge an SReg_64 and access the subregs?
	unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
	unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
	// STmp0 = NGROUPS.Z, STmp1 = NGROUPS.Y (loaded from the kernel inputs).
	BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
	.addReg(InputPtrReg)
	.addImm(SI::KernelInputOffsets::NGROUPS_Z);
	BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
	.addReg(InputPtrReg)
	.addImm(SI::KernelInputOffsets::NGROUPS_Y);

	// NGROUPS.Y * NGROUPS.Z
	BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
	.addReg(STmp1)
	.addReg(STmp0);
	// (NGROUPS.Y * NGROUPS.Z) * TIDIG.X
	BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
	.addReg(STmp1)
	.addReg(TIDIGXReg);
	// NGROUPS.Z * TIDIG.Y + (NGROUPS.Y * NGROUPS.Z * TIDIG.X)
	BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
	.addReg(STmp0)
	.addReg(TIDIGYReg)
	.addReg(TIDReg);
	// (NGROUPS.Z * TIDIG.Y + (NGROUPS.Y * NGROUPS.Z * TIDIG.X)) + TIDIG.Z
	getAddNoCarry(Entry, Insert, DL, TIDReg)
	.addReg(TIDReg)
	.addReg(TIDIGZReg);
	} else {
	// Get the wave id
	BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
	TIDReg)
	.addImm(-1)
	.addImm(0);

	BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
	TIDReg)
	.addImm(-1)
	.addReg(TIDReg);
	}

	// Scale the ID by 4 (shift left by 2) and cache the register.
	BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
	TIDReg)
	.addImm(2)
	.addReg(TIDReg);
	MFI->setTIDReg(TIDReg);
	}

	// Add FrameIndex to LDS offset
	unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
	getAddNoCarry(MBB, MI, DL, TmpReg)
	.addImm(LDSOffset)
	.addReg(TIDReg);

	return TmpReg;
	}

	/// Inserts S_NOP instructions before \p MI covering \p Count wait states.
	///
	/// Each S_NOP covers up to 8 wait states and takes "states - 1" as its
	/// immediate. Nothing is emitted when \p Count is not positive.
	void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
	                                   MachineBasicBlock::iterator MI,
	                                   int Count) const {
	  DebugLoc DL = MBB.findDebugLoc(MI);
	  for (int Remaining = Count; Remaining > 0; Remaining -= 8) {
	    // A full S_NOP encodes 7; the final partial one encodes Remaining - 1.
	    const int Arg = Remaining >= 8 ? 7 : Remaining - 1;
	    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
	        .addImm(Arg);
	  }
	}

	/// Inserts a no-op before \p MI, implemented as a single wait state.
	void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
	                             MachineBasicBlock::iterator MI) const {
	  const int SingleWaitState = 1;
	  insertWaitStates(MBB, MI, SingleWaitState);
	}

	/// Append a program-ending instruction to \p MBB when it has neither
	/// successors nor a terminator: S_ENDPGM if the function returns void,
	/// SI_RETURN_TO_EPILOG otherwise. Asserts that the parent function is an
	/// entry function.
	void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
	  MachineFunction *ParentMF = MBB.getParent();
	  SIMachineFunctionInfo *FuncInfo = ParentMF->getInfo<SIMachineFunctionInfo>();

	  assert(FuncInfo->isEntryFunction());

	  // A block that flows into a successor needs no return.
	  if (!MBB.succ_empty())
	    return;

	  // An existing terminator already ends the block.
	  if (MBB.getFirstTerminator() != MBB.end())
	    return;

	  const unsigned EndOpc = FuncInfo->returnsVoid() ? AMDGPU::S_ENDPGM
	                                                  : AMDGPU::SI_RETURN_TO_EPILOG;
	  BuildMI(MBB, MBB.end(), DebugLoc(), get(EndOpc));
	}

	/// Return the number of wait states attributed to \p MI: the immediate
	/// operand plus one for S_NOP, and 1 for every other opcode.
	unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
	  if (MI.getOpcode() == AMDGPU::S_NOP)
	    return MI.getOperand(0).getImm() + 1;

	  // FIXME: Do wait states equal cycles?
	  return 1;
	}

	/// Expand the post-RA pseudo instruction \p MI.
	///
	/// Opcodes not listed here are forwarded to
	/// AMDGPUInstrInfo::expandPostRAPseudo. Every opcode handled here returns
	/// true; the pseudo is either retargeted in place with setDesc() or replaced
	/// by newly built instructions and erased from its block.
	bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
	  MachineBasicBlock &MBB = *MI.getParent();
	  DebugLoc DL = MBB.findDebugLoc(MI);
	  switch (MI.getOpcode()) {
	  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
	  case AMDGPU::S_MOV_B64_term:
	    // This is only a terminator to get the correct spill code placement during
	    // register allocation.
	    MI.setDesc(get(AMDGPU::S_MOV_B64));
	    break;

	  case AMDGPU::S_XOR_B64_term:
	    // This is only a terminator to get the correct spill code placement during
	    // register allocation.
	    MI.setDesc(get(AMDGPU::S_XOR_B64));
	    break;

	  case AMDGPU::S_ANDN2_B64_term:
	    // This is only a terminator to get the correct spill code placement during
	    // register allocation.
	    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
	    break;

	  case AMDGPU::V_MOV_B64_PSEUDO: {
	    // Split the 64-bit move into two 32-bit moves, one per sub-register.
	    unsigned Dst = MI.getOperand(0).getReg();
	    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
	    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

	    const MachineOperand &SrcOp = MI.getOperand(1);
	    // FIXME: Will this work for 64-bit floating point immediates?
	    assert(!SrcOp.isFPImm());
	    if (SrcOp.isImm()) {
	      APInt Imm(64, SrcOp.getImm());
	      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
	        .addImm(Imm.getLoBits(32).getZExtValue())
	        .addReg(Dst, RegState::Implicit | RegState::Define);
	      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
	        .addImm(Imm.getHiBits(32).getZExtValue())
	        .addReg(Dst, RegState::Implicit | RegState::Define);
	    } else {
	      assert(SrcOp.isReg());
	      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
	        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
	        .addReg(Dst, RegState::Implicit | RegState::Define);
	      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
	        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
	        .addReg(Dst, RegState::Implicit | RegState::Define);
	    }
	    MI.eraseFromParent();
	    break;
	  }
	  case AMDGPU::V_SET_INACTIVE_B32: {
	    // Invert EXEC around the move, then invert it back.
	    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
	      .addReg(AMDGPU::EXEC);
	    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
	      .add(MI.getOperand(2));
	    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
	      .addReg(AMDGPU::EXEC);
	    MI.eraseFromParent();
	    break;
	  }
	  case AMDGPU::V_SET_INACTIVE_B64: {
	    // Same as the 32-bit case, but the move is a V_MOV_B64_PSEUDO that is
	    // expanded immediately by a recursive call.
	    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
	      .addReg(AMDGPU::EXEC);
	    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
	                                 MI.getOperand(0).getReg())
	      .add(MI.getOperand(2));
	    expandPostRAPseudo(*Copy);
	    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
	      .addReg(AMDGPU::EXEC);
	    MI.eraseFromParent();
	    break;
	  }
	  case AMDGPU::V_MOVRELD_B32_V1:
	  case AMDGPU::V_MOVRELD_B32_V2:
	  case AMDGPU::V_MOVRELD_B32_V4:
	  case AMDGPU::V_MOVRELD_B32_V8:
	  case AMDGPU::V_MOVRELD_B32_V16: {
	    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
	    unsigned VecReg = MI.getOperand(0).getReg();
	    bool IsUndef = MI.getOperand(1).isUndef();
	    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
	    assert(VecReg == MI.getOperand(1).getReg());

	    MachineInstr *MovRel =
	        BuildMI(MBB, MI, DL, MovRelDesc)
	            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
	            .add(MI.getOperand(2))
	            .addReg(VecReg, RegState::ImplicitDefine)
	            .addReg(VecReg,
	                    RegState::Implicit | (IsUndef ? RegState::Undef : 0));

	    // Tie the implicit def of the vector register to its implicit use; both
	    // were appended after the operands described by MovRelDesc.
	    const int ImpDefIdx =
	        MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
	    const int ImpUseIdx = ImpDefIdx + 1;
	    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);

	    MI.eraseFromParent();
	    break;
	  }
	  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
	    MachineFunction &MF = *MBB.getParent();
	    unsigned Reg = MI.getOperand(0).getReg();
	    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
	    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

	    // Create a bundle so these instructions won't be re-ordered by the
	    // post-RA scheduler.
	    MIBundleBuilder Bundler(MBB, MI);
	    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

	    // Add 32-bit offset from this instruction to the start of the
	    // constant data.
	    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
	                       .addReg(RegLo)
	                       .add(MI.getOperand(1)));

	    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
	                                  .addReg(RegHi);
	    if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
	      MIB.addImm(0);
	    else
	      MIB.add(MI.getOperand(2));

	    Bundler.append(MIB);
	    finalizeBundle(MBB, Bundler.begin());

	    MI.eraseFromParent();
	    break;
	  }
	  case AMDGPU::EXIT_WWM: {
	    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
	    // is exited.
	    MI.setDesc(get(AMDGPU::S_MOV_B64));
	    break;
	  }
	  }
	  return true;
	}

	/// Exchange the immediates of the modifier operands named \p Src0OpName and
	/// \p Src1OpName on \p MI.
	///
	/// \returns false (changing nothing) if \p MI has no operand named
	/// \p Src0OpName; true once the two immediates have been swapped.
	bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
	                                      MachineOperand &Src0,
	                                      unsigned Src0OpName,
	                                      MachineOperand &Src1,
	                                      unsigned Src1OpName) const {
	  MachineOperand *Mods0 = getNamedOperand(MI, Src0OpName);
	  if (Mods0 == nullptr)
	    return false;

	  MachineOperand *Mods1 = getNamedOperand(MI, Src1OpName);
	  assert(Mods1 &&
	         "All commutable instructions have both src0 and src1 modifiers");

	  // Read both values before writing either one.
	  const int Mods0Val = Mods0->getImm();
	  const int Mods1Val = Mods1->getImm();
	  Mods0->setImm(Mods1Val);
	  Mods1->setImm(Mods0Val);
	  return true;
	}

	/// Swap a register operand with an immediate or frame-index operand of the
	/// same instruction. The register's sub-register index and its
	/// kill/dead/undef/debug flags move with it into the other slot.
	///
	/// \returns \p MI, or nullptr (leaving both operands untouched) when
	/// \p NonRegOp is neither an immediate nor a frame index.
	static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
	                                             MachineOperand &RegOp,
	                                             MachineOperand &NonRegOp) {
	  // Snapshot the register operand before it is overwritten below.
	  const unsigned SavedReg = RegOp.getReg();
	  const unsigned SavedSubReg = RegOp.getSubReg();
	  const bool WasKill = RegOp.isKill();
	  const bool WasDead = RegOp.isDead();
	  const bool WasUndef = RegOp.isUndef();
	  const bool WasDebug = RegOp.isDebug();

	  if (NonRegOp.isImm()) {
	    RegOp.ChangeToImmediate(NonRegOp.getImm());
	  } else if (NonRegOp.isFI()) {
	    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
	  } else {
	    return nullptr;
	  }

	  NonRegOp.ChangeToRegister(SavedReg, false, false, WasKill, WasDead, WasUndef,
	                            WasDebug);
	  NonRegOp.setSubReg(SavedSubReg);
	  return &MI;
	}

	/// Commute src0 and src1 of \p MI in place and switch it to its commuted
	/// opcode.
	///
	/// \returns the commuted instruction, or nullptr when there is no commuted
	/// opcode, when both sources are non-registers, or when the operand that
	/// would land in src1 is not legal there.
	MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
	                                                  unsigned Src0Idx,
	                                                  unsigned Src1Idx) const {
	  assert(!NewMI && "this should never be used");

	  const unsigned Opc = MI.getOpcode();
	  const int CommutedOpcode = commuteOpcode(Opc);
	  if (CommutedOpcode == -1)
	    return nullptr;

	  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
	           static_cast<int>(Src0Idx) &&
	         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
	           static_cast<int>(Src1Idx) &&
	         "inconsistency with findCommutedOpIndices");

	  MachineOperand &Src0 = MI.getOperand(Src0Idx);
	  MachineOperand &Src1 = MI.getOperand(Src1Idx);
	  const bool Src0IsReg = Src0.isReg();
	  const bool Src1IsReg = Src1.isReg();

	  // FIXME: Found two non registers to commute. This does happen.
	  if (!Src0IsReg && !Src1IsReg)
	    return nullptr;

	  MachineInstr *Result = nullptr;
	  if (Src0IsReg && !Src1IsReg) {
	    // src0 should always be able to support any operand type, so no need to
	    // check operand legality.
	    Result = swapRegAndNonRegOperand(MI, Src0, Src1);
	  } else if (isOperandLegal(MI, Src1Idx, &Src0)) {
	    // src1 is a register here; what currently sits in src0 is legal in src1.
	    if (Src0IsReg)
	      Result =
	          TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
	    else
	      Result = swapRegAndNonRegOperand(MI, Src1, Src0);
	  }

	  if (!Result)
	    return nullptr;

	  // Be sure to copy the source modifiers to the right place.
	  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
	                      Src1, AMDGPU::OpName::src1_modifiers);
	  Result->setDesc(get(CommutedOpcode));
	  return Result;
	}

	// The source modifiers may sit between the true commutable operands, so the
	// indices have to be looked up by operand name. The base
	// TargetInstrInfo::commuteInstruction relies on this override.
	bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
	                                        unsigned &SrcOpIdx1) const {
	  if (!MI.isCommutable())
	    return false;

	  const unsigned Opc = MI.getOpcode();
	  const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
	  const int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

	  // Both named sources must exist on this opcode.
	  const bool HasBothSources = Src0Idx != -1 && Src1Idx != -1;
	  if (!HasBothSources)
	    return false;

	  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
	}

	bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
	int64_t BrOffset) const {
	// BranchRelaxation should never have to check s_setpc_b64 because its dest
	// block is unanalyzable.
	assert(BranchOp != AMDGPU::S_SETPC_B64);

	// Convert to dwords.
	BrOffset /= 4;

	// The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
	// from the next instruction.
	BrOffset -= 1;

	return isIntN(BranchOffsetBits, BrOffset);
	}

	/// Return the block targeted by branch \p MI (its operand 0), or nullptr for
	/// S_SETPC_B64.
	MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
	    const MachineInstr &MI) const {
	  // This would be a difficult analysis to perform for s_setpc_b64, but it can
	  // always be legal so there's no need to analyze it.
	  const bool IsSetPC = MI.getOpcode() == AMDGPU::S_SETPC_B64;
	  if (IsSetPC)
	    return nullptr;

	  const MachineOperand &DestOp = MI.getOperand(0);
	  return DestOp.getMBB();
	}

	/// Fill the empty block \p MBB with a long branch to \p DestBB:
	/// s_getpc_b64, a 64-bit add (forward) or subtract (backward) of the
	/// destination, and s_setpc_b64. The PC register pair is created as a
	/// virtual register and then replaced with a scavenged SGPR pair.
	///
	/// \returns the value 4 + 8 + 4 + 4.
	unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
	                                           MachineBasicBlock &DestBB,
	                                           const DebugLoc &DL,
	                                           int64_t BrOffset,
	                                           RegScavenger *RS) const {
	  assert(RS && "RegScavenger required for long branching");
	  assert(MBB.empty() &&
	         "new block should be inserted for expanding unconditional branch");
	  assert(MBB.pred_size() == 1);

	  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

	  // FIXME: Virtual register workaround for RegScavenger not working with empty
	  // blocks.
	  const unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

	  auto InsertPt = MBB.end();

	  // We need to compute the offset relative to the instruction immediately after
	  // s_getpc_b64. Insert pc arithmetic code before last terminator.
	  MachineInstr *GetPC =
	      BuildMI(MBB, InsertPt, DL, get(AMDGPU::S_GETPC_B64), PCReg);

	  // TODO: Handle > 32-bit block address.
	  // Forward branches add the destination to the PC, backward branches
	  // subtract it; the high half only propagates the carry/borrow.
	  const bool IsForward = BrOffset >= 0;
	  const unsigned LoOpc = IsForward ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
	  const unsigned HiOpc = IsForward ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
	  const auto DestFlag = IsForward ? AMDGPU::TF_LONG_BRANCH_FORWARD
	                                  : AMDGPU::TF_LONG_BRANCH_BACKWARD;

	  BuildMI(MBB, InsertPt, DL, get(LoOpc))
	    .addReg(PCReg, RegState::Define, AMDGPU::sub0)
	    .addReg(PCReg, 0, AMDGPU::sub0)
	    .addMBB(&DestBB, DestFlag);
	  BuildMI(MBB, InsertPt, DL, get(HiOpc))
	    .addReg(PCReg, RegState::Define, AMDGPU::sub1)
	    .addReg(PCReg, 0, AMDGPU::sub1)
	    .addImm(0);

	  // Insert the indirect branch after the other terminator.
	  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
	    .addReg(PCReg);

	  // FIXME: If spilling is necessary, this will fail because this scavenger has
	  // no emergency stack slots. It is non-trivial to spill in this situation,
	  // because the restore code needs to be specially placed after the
	  // jump. BranchRelaxation then needs to be made aware of the newly inserted
	  // block.
	  //
	  // If a spill is needed for the pc register pair, we need to insert a spill
	  // restore block right before the destination block, and insert a short branch
	  // into the old destination block's fallthrough predecessor.
	  // e.g.:
	  //
	  // s_cbranch_scc0 skip_long_branch:
	  //
	  // long_branch_bb:
	  //   spill s[8:9]
	  //   s_getpc_b64 s[8:9]
	  //   s_add_u32 s8, s8, restore_bb
	  //   s_addc_u32 s9, s9, 0
	  //   s_setpc_b64 s[8:9]
	  //
	  // skip_long_branch:
	  //   foo;
	  //
	  // .....
	  //
	  // dest_bb_fallthrough_predecessor:
	  //   bar;
	  //   s_branch dest_bb
	  //
	  // restore_bb:
	  //   restore s[8:9]
	  //   fallthrough dest_bb
	  //
	  // dest_bb:
	  //   buzz;

	  // Swap the temporary virtual register for a scavenged physical pair.
	  RS->enterBasicBlockEnd(MBB);
	  const unsigned Scav = RS->scavengeRegister(
	      &AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), 0);
	  MRI.replaceRegWith(PCReg, Scav);
	  MRI.clearVirtRegs();
	  RS->setRegUsed(Scav);

	  return 4 + 8 + 4 + 4;
	}

	/// Map a branch predicate to the S_CBRANCH_* opcode that branches on it.
	/// Any predicate not listed is treated as unreachable.
	unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
	  switch (Cond) {
	  case SCC_TRUE:  return AMDGPU::S_CBRANCH_SCC1;
	  case SCC_FALSE: return AMDGPU::S_CBRANCH_SCC0;
	  case VCCNZ:     return AMDGPU::S_CBRANCH_VCCNZ;
	  case VCCZ:      return AMDGPU::S_CBRANCH_VCCZ;
	  case EXECNZ:    return AMDGPU::S_CBRANCH_EXECNZ;
	  case EXECZ:     return AMDGPU::S_CBRANCH_EXECZ;
	  default:
	    break;
	  }
	  llvm_unreachable("invalid branch predicate");
	}

	/// Map an S_CBRANCH_* opcode to its branch predicate; every other opcode
	/// yields INVALID_BR.
	SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
	  BranchPredicate Pred = INVALID_BR;
	  switch (Opcode) {
	  case AMDGPU::S_CBRANCH_SCC0:   Pred = SCC_FALSE; break;
	  case AMDGPU::S_CBRANCH_SCC1:   Pred = SCC_TRUE;  break;
	  case AMDGPU::S_CBRANCH_VCCNZ:  Pred = VCCNZ;     break;
	  case AMDGPU::S_CBRANCH_VCCZ:   Pred = VCCZ;      break;
	  case AMDGPU::S_CBRANCH_EXECNZ: Pred = EXECNZ;    break;
	  case AMDGPU::S_CBRANCH_EXECZ:  Pred = EXECZ;     break;
	  default:
	    break;
	  }
	  return Pred;
	}

	/// Analyze the branch sequence starting at terminator \p I.
	///
	/// Recognized shapes are a lone S_BRANCH, or a conditional branch
	/// (SI_NON_UNIFORM_BRCOND_PSEUDO or an S_CBRANCH_*) that is either last in
	/// the block or followed by an S_BRANCH. \p TBB, \p FBB and \p Cond are
	/// filled in accordingly.
	///
	/// \returns false if the sequence was understood, true otherwise.
	bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
	                                    MachineBasicBlock::iterator I,
	                                    MachineBasicBlock *&TBB,
	                                    MachineBasicBlock *&FBB,
	                                    SmallVectorImpl<MachineOperand> &Cond,
	                                    bool AllowModify) const {
	  // Unconditional branch.
	  if (I->getOpcode() == AMDGPU::S_BRANCH) {
	    TBB = I->getOperand(0).getMBB();
	    return false;
	  }

	  MachineBasicBlock *CondDest = nullptr;
	  if (I->getOpcode() != AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
	    const BranchPredicate Pred = getBranchPredicate(I->getOpcode());
	    if (Pred == INVALID_BR)
	      return true;

	    CondDest = I->getOperand(0).getMBB();
	    Cond.push_back(MachineOperand::CreateImm(Pred));
	    Cond.push_back(I->getOperand(1)); // Save the branch register.
	  } else {
	    // The pseudo carries the condition in operand 0 and the target in 1.
	    CondDest = I->getOperand(1).getMBB();
	    Cond.push_back(I->getOperand(0));
	  }

	  ++I;
	  const bool FallsThrough = I == MBB.end();

	  // Anything other than a trailing S_BRANCH after the conditional branch is
	  // not understood.
	  if (!FallsThrough && I->getOpcode() != AMDGPU::S_BRANCH)
	    return true;

	  TBB = CondDest;
	  if (!FallsThrough)
	    FBB = I->getOperand(0).getMBB();
	  return false;
	}

	/// Analyze the terminators of \p MBB, looking through a leading
	/// SI_MASK_BRANCH.
	///
	/// \returns false if the branching was understood (including a block with
	/// no terminator at all), true if it could not be analyzed. When the first
	/// terminator is SI_MASK_BRANCH, the block is only analyzable if the
	/// following conditional branch targets the same block as operand 0 of the
	/// instruction after the mask branch and tests EXECZ or EXECNZ.
	bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
	                                MachineBasicBlock *&FBB,
	                                SmallVectorImpl<MachineOperand> &Cond,
	                                bool AllowModify) const {
	  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
	  if (I == MBB.end())
	    return false;

	  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
	    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

	  // Skip the mask branch and analyze what follows it.
	  ++I;

	  // TODO: Should be able to treat as fallthrough?
	  if (I == MBB.end())
	    return true;

	  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
	    return true;

	  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

	  // Specifically handle the case where the conditional branch is to the same
	  // destination as the mask branch. e.g.
	  //
	  // si_mask_branch BB8
	  // s_cbranch_execz BB8
	  // s_cbranch BB9
	  //
	  // This is required to understand divergent loops which may need the branches
	  // to be relaxed.
	  if (TBB != MaskBrDest || Cond.empty())
	    return true;

	  auto Pred = Cond[0].getImm();
	  return (Pred != EXECZ && Pred != EXECNZ);
	}

	/// Erase every terminator of \p MBB except SI_MASK_BRANCH.
	///
	/// \param BytesRemoved if non-null, receives the summed size in bytes of
	/// the erased instructions.
	/// \returns the number of instructions erased.
	unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
	                                   int *BytesRemoved) const {
	  unsigned NumErased = 0;
	  unsigned ErasedBytes = 0;

	  MachineBasicBlock::iterator It = MBB.getFirstTerminator();
	  while (It != MBB.end()) {
	    // Step past the current instruction first; erasing it would invalidate
	    // an iterator still pointing at it.
	    MachineInstr &Term = *It;
	    It = std::next(It);

	    if (Term.getOpcode() == AMDGPU::SI_MASK_BRANCH)
	      continue;

	    ErasedBytes += getInstSizeInBytes(Term);
	    Term.eraseFromParent();
	    ++NumErased;
	  }

	  if (BytesRemoved)
	    *BytesRemoved = ErasedBytes;

	  return NumErased;
	}

	// Mirror the kill and undef flags of OrigCond onto CondReg, the implicit
	// condition register operand of a newly built instruction.
	static void preserveCondRegFlags(MachineOperand &CondReg,
	                                 const MachineOperand &OrigCond) {
	  const bool WasUndef = OrigCond.isUndef();
	  const bool WasKill = OrigCond.isKill();
	  CondReg.setIsUndef(WasUndef);
	  CondReg.setIsKill(WasKill);
	}

	/// Append branch instructions to the end of \p MBB.
	///
	/// - No \p FBB and empty \p Cond: a single S_BRANCH to \p TBB.
	/// - \p Cond holding exactly one register operand: a single
	///   SI_NON_UNIFORM_BRCOND_PSEUDO on that register. \p BytesAdded is left
	///   untouched on this path.
	/// - Otherwise \p Cond is {predicate immediate, condition register}: a
	///   conditional branch to \p TBB, followed by an S_BRANCH to \p FBB when
	///   \p FBB is non-null.
	///
	/// \param BytesAdded if non-null, receives 4 per instruction inserted
	/// (except on the pseudo path noted above).
	/// \returns the number of instructions inserted.
	unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
	                                   MachineBasicBlock *TBB,
	                                   MachineBasicBlock *FBB,
	                                   ArrayRef<MachineOperand> Cond,
	                                   const DebugLoc &DL,
	                                   int *BytesAdded) const {
	  if (!FBB && Cond.empty()) {
	    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
	      .addMBB(TBB);
	    if (BytesAdded)
	      *BytesAdded = 4;
	    return 1;
	  }

	  if (Cond.size() == 1 && Cond[0].isReg()) {
	    BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
	      .add(Cond[0])
	      .addMBB(TBB);
	    return 1;
	  }

	  assert(TBB && Cond[0].isImm());

	  unsigned Opcode
	    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

	  // Both remaining shapes start with the same conditional branch.
	  MachineInstr *CondBr =
	    BuildMI(&MBB, DL, get(Opcode))
	    .addMBB(TBB);

	  // Copy the flags onto the implicit condition register operand.
	  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);

	  if (!FBB) {
	    if (BytesAdded)
	      *BytesAdded = 4;
	    return 1;
	  }

	  // Two-way branch: fall back to FBB with an unconditional branch.
	  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
	    .addMBB(FBB);

	  if (BytesAdded)
	    *BytesAdded = 8;

	  return 2;
	}

	/// Reverse the branch condition in \p Cond by negating its predicate
	/// immediate.
	///
	/// \returns false if the condition was reversed; true if \p Cond does not
	/// have exactly two operands or its first operand is not an immediate.
	bool SIInstrInfo::reverseBranchCondition(
	    SmallVectorImpl<MachineOperand> &Cond) const {
	  const bool CanReverse = Cond.size() == 2 && Cond[0].isImm();
	  if (!CanReverse)
	    return true;

	  MachineOperand &PredOp = Cond[0];
	  PredOp.setImm(-PredOp.getImm());
	  return false;
	}

	/// Decide whether a select on \p Cond between \p TrueReg and \p FalseReg can
	/// be inserted, and report its cost through the three cycle outputs.
	///
	/// VCC predicates allow non-SGPR classes needing at most six 32-bit selects;
	/// SCC predicates allow SGPR classes only. Any other predicate is rejected
	/// without touching the cycle outputs.
	bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
	                                  ArrayRef<MachineOperand> Cond,
	                                  unsigned TrueReg, unsigned FalseReg,
	                                  int &CondCycles,
	                                  int &TrueCycles, int &FalseCycles) const {
	  const auto Pred = Cond[0].getImm();
	  const bool IsVCCPred = Pred == VCCNZ || Pred == VCCZ;
	  const bool IsSCCPred = Pred == SCC_TRUE || Pred == SCC_FALSE;
	  if (!IsVCCPred && !IsSCCPred)
	    return false;

	  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
	  assert(MRI.getRegClass(FalseReg) == RC);

	  // One select per 32 bits of the register class.
	  int SelectCount = AMDGPU::getRegBitWidth(RC->getID()) / 32;

	  if (IsVCCPred) {
	    CondCycles = TrueCycles = FalseCycles = SelectCount; // ???

	    // Limit to equal cost for branch vs. N v_cndmask_b32s.
	    return !RI.isSGPRClass(RC) && SelectCount <= 6;
	  }

	  // FIXME: We could insert for VGPRs if we could replace the original compare
	  // with a vector one.

	  // Multiples of 8 can do s_cselect_b64
	  if (SelectCount % 2 == 0)
	    SelectCount /= 2;

	  CondCycles = TrueCycles = FalseCycles = SelectCount; // ???
	  return RI.isSGPRClass(RC);
	}

	/// Insert before \p I a select that writes \p TrueReg or \p FalseReg into
	/// \p DstReg depending on the predicate in \p Cond.
	///
	/// VCCZ / SCC_FALSE are normalized to their negated predicate by swapping
	/// the two inputs. A 32-bit destination (and a 64-bit one under SCC) takes a
	/// single select; wider destinations are selected piecewise into fresh
	/// virtual registers and reassembled with a REG_SEQUENCE.
	void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
	                               MachineBasicBlock::iterator I, const DebugLoc &DL,
	                               unsigned DstReg, ArrayRef<MachineOperand> Cond,
	                               unsigned TrueReg, unsigned FalseReg) const {
	  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
	  if (Pred == VCCZ || Pred == SCC_FALSE) {
	    Pred = static_cast<BranchPredicate>(-Pred);
	    std::swap(TrueReg, FalseReg);
	  }

	  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
	  unsigned DstSize = RI.getRegSizeInBits(*DstRC);

	  if (DstSize == 32) {
	    unsigned SelOp = Pred == SCC_TRUE ?
	      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;

	    // Instruction's operands are backwards from what is expected.
	    MachineInstr *Select =
	      BuildMI(MBB, I, DL, get(SelOp), DstReg)
	      .addReg(FalseReg)
	      .addReg(TrueReg);

	    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
	    return;
	  }

	  if (DstSize == 64 && Pred == SCC_TRUE) {
	    MachineInstr *Select =
	      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
	      .addReg(FalseReg)
	      .addReg(TrueReg);

	    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
	    return;
	  }

	  static const int16_t Sub0_15[] = {
	    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
	    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
	    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
	    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
	  };

	  static const int16_t Sub0_15_64[] = {
	    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
	    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
	    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
	    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
	  };

	  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
	  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
	  const int16_t *SubIndices = Sub0_15;
	  int NElts = DstSize / 32;

	  // 64-bit select is only available for SALU.
	  if (Pred == SCC_TRUE) {
	    SelOp = AMDGPU::S_CSELECT_B64;
	    EltRC = &AMDGPU::SGPR_64RegClass;
	    SubIndices = Sub0_15_64;

	    assert(NElts % 2 == 0);
	    NElts /= 2;
	  }

	  MachineInstrBuilder MIB = BuildMI(
	    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

	  // Build the per-element selects in front of the REG_SEQUENCE that
	  // consumes them.
	  I = MIB->getIterator();

	  SmallVector<unsigned, 8> Regs;
	  for (int Idx = 0; Idx != NElts; ++Idx) {
	    unsigned DstElt = MRI.createVirtualRegister(EltRC);
	    Regs.push_back(DstElt);

	    unsigned SubIdx = SubIndices[Idx];

	    MachineInstr *Select =
	      BuildMI(MBB, I, DL, get(SelOp), DstElt)
	      .addReg(FalseReg, 0, SubIdx)
	      .addReg(TrueReg, 0, SubIdx);
	    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

	    MIB.addReg(DstElt)
	       .addImm(SubIdx);
	  }
	}

	bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
	switch (MI.getOpcode()) {
	case AMDGPU::V_MOV_B32_e32:
	case AMDGPU::V_MOV_B32_e64:
	case AMDGPU::V_MOV_B64_PSEUDO: {
	// If there are additional implicit register operands, this may be used for
	// register indexing so the source register operand isn't simply copied.
	unsigned NumOps = MI.getDesc().getNumOperands() +
	MI.getDesc().getNumImplicitUses();

	return MI.getNumOperands() == NumOps;
	}
	case AMDGPU::S_MOV_B32:
	case AMDGPU::S_MOV_B64:
	case AMDGPU::COPY:
	return true;
	default:
	return false;
	}
	}

	/// Map a pseudo source value kind to an address space: private for stack
	/// and fixed-stack objects, constant for the other listed kinds, and flat
	/// for anything else.
	unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
	    PseudoSourceValue::PSVKind Kind) const {
	  const bool IsStackObject = Kind == PseudoSourceValue::Stack ||
	                             Kind == PseudoSourceValue::FixedStack;
	  if (IsStackObject)
	    return AMDGPUASI.PRIVATE_ADDRESS;

	  switch (Kind) {
	  case PseudoSourceValue::ConstantPool:
	  case PseudoSourceValue::GOT:
	  case PseudoSourceValue::JumpTable:
	  case PseudoSourceValue::GlobalValueCallEntry:
	  case PseudoSourceValue::ExternalSymbolCallEntry:
	  case PseudoSourceValue::TargetCustom:
	    return AMDGPUASI.CONSTANT_ADDRESS;
	  default:
	    break;
	  }

	  return AMDGPUASI.FLAT_ADDRESS;
	}

	/// Remove the src2, src1 and src0 modifier operands from \p MI, in that
	/// order, using the operand indices of its current opcode.
	static void removeModOperands(MachineInstr &MI) {
	  const unsigned Opc = MI.getOpcode();

	  // Listed src2 first, matching the required removal order.
	  const unsigned ModOperandNames[] = {
	    AMDGPU::OpName::src2_modifiers,
	    AMDGPU::OpName::src1_modifiers,
	    AMDGPU::OpName::src0_modifiers,
	  };

	  for (const unsigned Name : ModOperandNames) {
	    const int ModIdx = AMDGPU::getNamedOperandIdx(Opc, Name);
	    MI.RemoveOperand(ModIdx);
	  }
	}

	/// Try to fold the immediate moved by \p DefMI into \p UseMI, which must be
	/// the only non-debug user of \p Reg.
	///
	/// Only V_MOV_B32_e32 / S_MOV_B32 definitions with an immediate src0 are
	/// considered. A COPY user becomes a move of the immediate. A V_MAD / V_MAC
	/// (F16 or F32) user without modifiers becomes V_MADMK when \p Reg is src0,
	/// or V_MADAK when \p Reg is src2; afterwards \p DefMI is erased if \p Reg
	/// still has exactly one non-debug use.
	///
	/// \returns true if \p UseMI was rewritten, false otherwise.
	bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
	                                unsigned Reg, MachineRegisterInfo *MRI) const {
	  if (!MRI->hasOneNonDBGUse(Reg))
	    return false;

	  switch (DefMI.getOpcode()) {
	  default:
	    return false;
	  case AMDGPU::S_MOV_B64:
	    // TODO: We could fold 64-bit immediates, but this get compilicated
	    // when there are sub-registers.
	    return false;

	  case AMDGPU::V_MOV_B32_e32:
	  case AMDGPU::S_MOV_B32:
	    break;
	  }

	  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
	  assert(ImmOp);
	  // FIXME: We could handle FrameIndex values here.
	  if (!ImmOp->isImm())
	    return false;

	  unsigned Opc = UseMI.getOpcode();
	  if (Opc == AMDGPU::COPY) {
	    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
	    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
	    UseMI.setDesc(get(NewOpc));
	    UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
	    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
	    return true;
	  }

	  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
	      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
	    // Don't fold if we are using source or output modifiers. The new VOP2
	    // instructions don't have them.
	    if (hasAnyModifiersSet(UseMI))
	      return false;

	    // If this is a free constant, there's no reason to do this.
	    // TODO: We could fold this here instead of letting SIFoldOperands do it
	    // later.
	    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);

	    // Any src operand can be used for the legality check.
	    if (isInlineConstant(UseMI, *Src0, *ImmOp))
	      return false;

	    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
	    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
	    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

	    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
	    // We should only expect these to be on src0 due to canonicalizations.
	    if (Src0->isReg() && Src0->getReg() == Reg) {
	      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
	        return false;

	      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
	        return false;

	      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

	      const int64_t Imm = ImmOp->getImm();

	      // FIXME: This would be a lot easier if we could return a new instruction
	      // instead of having to modify in place.

	      // Remove these first since they are at the end.
	      UseMI.RemoveOperand(
	          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
	      UseMI.RemoveOperand(
	          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

	      unsigned Src1Reg = Src1->getReg();
	      unsigned Src1SubReg = Src1->getSubReg();
	      Src0->setReg(Src1Reg);
	      Src0->setSubReg(Src1SubReg);
	      Src0->setIsKill(Src1->isKill());

	      if (Opc == AMDGPU::V_MAC_F32_e64 ||
	          Opc == AMDGPU::V_MAC_F16_e64)
	        UseMI.untieRegOperand(
	            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

	      Src1->ChangeToImmediate(Imm);

	      removeModOperands(UseMI);
	      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));

	      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
	      if (DeleteDef)
	        DefMI.eraseFromParent();

	      return true;
	    }

	    // Added part is the constant: Use v_madak_{f16, f32}.
	    if (Src2->isReg() && Src2->getReg() == Reg) {
	      // Not allowed to use constant bus for another operand.
	      // We can however allow an inline immediate as src0.
	      if (!Src0->isImm() &&
	          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
	        return false;

	      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
	        return false;

	      const int64_t Imm = ImmOp->getImm();

	      // FIXME: This would be a lot easier if we could return a new instruction
	      // instead of having to modify in place.

	      // Remove these first since they are at the end.
	      UseMI.RemoveOperand(
	          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
	      UseMI.RemoveOperand(
	          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

	      if (Opc == AMDGPU::V_MAC_F32_e64 ||
	          Opc == AMDGPU::V_MAC_F16_e64)
	        UseMI.untieRegOperand(
	            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

	      // ChangingToImmediate adds Src2 back to the instruction.
	      Src2->ChangeToImmediate(Imm);

	      // These come before src2.
	      removeModOperands(UseMI);
	      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));

	      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
	      if (DeleteDef)
	        DefMI.eraseFromParent();

	      return true;
	    }
	  }

	  return false;
	}

	/// Return true if the access starting at the lower offset ends at or before
	/// the higher offset. When both offsets are equal, access A is treated as
	/// the lower one.
	static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
	                                int WidthB, int OffsetB) {
	  if (OffsetA <= OffsetB)
	    return OffsetA + WidthA <= OffsetB;

	  return OffsetB + WidthB <= OffsetA;
	}

	/// Return true if \p MIa and \p MIb are known to access non-overlapping
	/// ranges off the same base register.
	///
	/// Both instructions must decompose into base register + immediate offset
	/// and each must have exactly one memory operand, whose size is taken as the
	/// access width. Returns false whenever disjointness cannot be shown.
	bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
	                                               MachineInstr &MIb) const {
	  unsigned BaseReg0, BaseReg1;
	  int64_t Offset0, Offset1;

	  if (!getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) ||
	      !getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI))
	    return false;

	  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
	    // FIXME: Handle ds_read2 / ds_write2.
	    return false;
	  }

	  unsigned Width0 = (*MIa.memoperands_begin())->getSize();
	  unsigned Width1 = (*MIb.memoperands_begin())->getSize();

	  // Offsets off different base registers cannot be compared.
	  return BaseReg0 == BaseReg1 &&
	         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
	}

	/// Return true if the memory accesses of \p MIa and \p MIb can cheaply be
	/// shown not to overlap.
	///
	/// Instructions with unmodeled side effects or ordered memory references
	/// are never considered disjoint. Otherwise alias analysis is consulted
	/// when \p AA is provided and both instructions have a single memory operand
	/// with an IR value. Failing that, the decision is made from the instruction
	/// kind (DS, MUBUF/MTBUF, SMRD, FLAT) and, for two instructions of the same
	/// kind, from their base register and offset.
	bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
	                                                  MachineInstr &MIb,
	                                                  AliasAnalysis *AA) const {
	  assert((MIa.mayLoad() || MIa.mayStore()) &&
	         "MIa must load from or modify a memory location");
	  assert((MIb.mayLoad() || MIb.mayStore()) &&
	         "MIb must load from or modify a memory location");

	  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
	    return false;

	  // XXX - Can we relax this between address spaces?
	  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
	    return false;

	  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
	    const MachineMemOperand *MMOa = *MIa.memoperands_begin();
	    const MachineMemOperand *MMOb = *MIb.memoperands_begin();
	    if (MMOa->getValue() && MMOb->getValue()) {
	      MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
	      MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
	      if (!AA->alias(LocA, LocB))
	        return true;
	    }
	  }

	  // TODO: Should we check the address space from the MachineMemOperand? That
	  // would allow us to distinguish objects we know don't alias based on the
	  // underlying address space, even if it was lowered to a different one,
	  // e.g. private accesses lowered to use MUBUF instructions on a scratch
	  // buffer.
	  if (isDS(MIa)) {
	    if (isDS(MIb))
	      return checkInstOffsetsDoNotOverlap(MIa, MIb);

	    return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
	  }

	  if (isMUBUF(MIa) || isMTBUF(MIa)) {
	    if (isMUBUF(MIb) || isMTBUF(MIb))
	      return checkInstOffsetsDoNotOverlap(MIa, MIb);

	    return !isFLAT(MIb) && !isSMRD(MIb);
	  }

	  if (isSMRD(MIa)) {
	    if (isSMRD(MIb))
	      return checkInstOffsetsDoNotOverlap(MIa, MIb);

	    // The buffer checks must look at MIb: MIa is known to be SMRD here, so
	    // testing MIa would always pass and wrongly report SMRD vs. MUBUF/MTBUF
	    // as disjoint, contradicting the MUBUF/MTBUF case above.
	    return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
	  }

	  if (isFLAT(MIa)) {
	    if (isFLAT(MIb))
	      return checkInstOffsetsDoNotOverlap(MIa, MIb);

	    return false;
	  }

	  return false;
	}

	/// Return the immediate that the register operand \p MO is defined to, or 0
	/// if there is none.
	///
	/// An immediate is found only when \p MO is a register whose unique virtual
	/// register definition is a V_MOV_B32_e32 of an immediate. Because 0 is also
	/// the "not foldable" result, a move of the constant 0 is indistinguishable
	/// from "no immediate".
	static int64_t getFoldableImm(const MachineOperand* MO) {
	  if (!MO->isReg())
	    return 0;
	  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
	  const MachineRegisterInfo &MRI = MF->getRegInfo();
	  auto Def = MRI.getUniqueVRegDef(MO->getReg());
	  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
	      Def->getOperand(1).isImm())
	    return Def->getOperand(1).getImm();
	  return 0;
	}

	/// Build the three-address form of a V_MAC_{F16,F32} instruction in front
	/// of \p MI.
	///
	/// When \p MI carries no source modifiers, clamp or omod and src0 is not an
	/// SGPR, a source defined by a foldable immediate yields V_MADAK / V_MADMK.
	/// Otherwise a V_MAD_{F16,F32} is built with the same operands.
	///
	/// \returns the new instruction, or nullptr for any other opcode and for an
	/// e32 V_MAC whose src0 is neither a register nor an inline-constant
	/// immediate.
	MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
	                                                 MachineInstr &MI,
	                                                 LiveVariables *LV) const {
	  bool IsF16 = false;

	  switch (MI.getOpcode()) {
	  default:
	    return nullptr;
	  case AMDGPU::V_MAC_F16_e64:
	    IsF16 = true;
	    LLVM_FALLTHROUGH;
	  case AMDGPU::V_MAC_F32_e64:
	    break;
	  case AMDGPU::V_MAC_F16_e32:
	    IsF16 = true;
	    LLVM_FALLTHROUGH;
	  case AMDGPU::V_MAC_F32_e32: {
	    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
	                                             AMDGPU::OpName::src0);
	    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
	    if (!Src0->isReg() && !Src0->isImm())
	      return nullptr;

	    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
	      return nullptr;

	    break;
	  }
	  }

	  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
	  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
	  const MachineOperand *Src0Mods =
	    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
	  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
	  const MachineOperand *Src1Mods =
	    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
	  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
	  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
	  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);

	  if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
	      // If we have an SGPR input, we will violate the constant bus restriction.
	      (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
	    // Constant addend: v_madak.
	    if (auto Imm = getFoldableImm(Src2)) {
	      return BuildMI(*MBB, MI, MI.getDebugLoc(),
	                     get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
	               .add(*Dst)
	               .add(*Src0)
	               .add(*Src1)
	               .addImm(Imm);
	    }
	    // Constant multiplicand in src1: v_madmk.
	    if (auto Imm = getFoldableImm(Src1)) {
	      return BuildMI(*MBB, MI, MI.getDebugLoc(),
	                     get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
	               .add(*Dst)
	               .add(*Src0)
	               .addImm(Imm)
	               .add(*Src2);
	    }
	    // Constant multiplicand in src0: v_madmk with src1 moved into src0, if
	    // src1 is legal there.
	    if (auto Imm = getFoldableImm(Src0)) {
	      if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
	                           AMDGPU::OpName::src0), Src1))
	        return BuildMI(*MBB, MI, MI.getDebugLoc(),
	                       get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
	                 .add(*Dst)
	                 .add(*Src1)
	                 .addImm(Imm)
	                 .add(*Src2);
	    }
	  }

	  return BuildMI(*MBB, MI, MI.getDebugLoc(),
	                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
	      .add(*Dst)
	      .addImm(Src0Mods ? Src0Mods->getImm() : 0)
	      .add(*Src0)
	      .addImm(Src1Mods ? Src1Mods->getImm() : 0)
	      .add(*Src1)
	      .addImm(0) // Src mods
	      .add(*Src2)
	      .addImm(Clamp ? Clamp->getImm() : 0)
	      .addImm(Omod ? Omod->getImm() : 0);
	}

	// Returns true for the S_SET_GPR_IDX_{ON,MODE,OFF} instructions. It is not
	// generally safe to move VALU instructions across these, since a moved
	// instruction would start using its register as a base index rather than
	// directly.
	// XXX - Why isn't hasSideEffects sufficient for these?
	static bool changesVGPRIndexingMode(const MachineInstr &MI) {
	  const unsigned Opc = MI.getOpcode();
	  return Opc == AMDGPU::S_SET_GPR_IDX_ON ||
	         Opc == AMDGPU::S_SET_GPR_IDX_MODE ||
	         Opc == AMDGPU::S_SET_GPR_IDX_OFF;
	}

	// Returns true if MI must not be scheduled across. On top of the generic
	// TargetInstrInfo rules, anything that modifies EXEC, the S_SETREG
	// instructions, and the VGPR indexing mode switches are boundaries.
	bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
	                                       const MachineBasicBlock *MBB,
	                                       const MachineFunction &MF) const {
	  // XXX - Do we want the SP check in the base implementation?
	  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
	    return true;

	  // Target-independent instructions do not have an implicit-use of EXEC, even
	  // when they operate on VGPRs. Treating EXEC modifications as scheduling
	  // boundaries prevents incorrect movements of such instructions.
	  if (MI.modifiesRegister(AMDGPU::EXEC, &RI))
	    return true;

	  switch (MI.getOpcode()) {
	  case AMDGPU::S_SETREG_IMM32_B32:
	  case AMDGPU::S_SETREG_B32:
	    return true;
	  default:
	    return changesVGPRIndexingMode(MI);
	  }
	}

	// Returns true if Imm can be encoded as an inline constant for its bit width.
	// Only 16-, 32- and 64-bit values are accepted; 16-bit values additionally
	// require a subtarget with 16-bit instructions.
	bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
	  const unsigned Width = Imm.getBitWidth();

	  if (Width == 32)
	    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
	                                        ST.hasInv2PiInlineImm());

	  if (Width == 64)
	    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
	                                        ST.hasInv2PiInlineImm());

	  if (Width == 16) {
	    if (!ST.has16BitInsts())
	      return false;
	    return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
	                                        ST.hasInv2PiInlineImm());
	  }

	  llvm_unreachable("invalid bitwidth");
	}

	// Returns true if MO is an immediate that can be encoded inline for an operand
	// of the given OperandType. Non-immediates, and operand types outside the
	// [OPERAND_SRC_FIRST, OPERAND_SRC_LAST] range, are never inline constants.
	bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
	                                   uint8_t OperandType) const {
	  if (!MO.isImm())
	    return false;
	  if (OperandType < AMDGPU::OPERAND_SRC_FIRST)
	    return false;
	  if (OperandType > AMDGPU::OPERAND_SRC_LAST)
	    return false;

	  // A MachineOperand only stores a 64-bit value and carries no record of the
	  // real operand width. The width has to come from the operand type: a 32-bit
	  // floating point bit pattern is a legal inline value for any 32-bit integer
	  // operand, but not for a 64-bit one.
	  const int64_t Value = MO.getImm();

	  switch (OperandType) {
	  case AMDGPU::OPERAND_REG_IMM_INT32:
	  case AMDGPU::OPERAND_REG_IMM_FP32:
	  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
	  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
	    // The value must survive a round trip through 32 bits.
	    const int32_t Low32 = static_cast<int32_t>(Value);
	    if (Low32 != Value)
	      return false;
	    return AMDGPU::isInlinableLiteral32(Low32, ST.hasInv2PiInlineImm());
	  }

	  case AMDGPU::OPERAND_REG_IMM_INT64:
	  case AMDGPU::OPERAND_REG_IMM_FP64:
	  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
	  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
	    return AMDGPU::isInlinableLiteral64(Value, ST.hasInv2PiInlineImm());

	  case AMDGPU::OPERAND_REG_IMM_INT16:
	  case AMDGPU::OPERAND_REG_IMM_FP16:
	  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
	  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
	    if (!isInt<16>(Value) && !isUInt<16>(Value))
	      return false;

	    // A few special case instructions have 16-bit operands on subtargets
	    // where 16-bit instructions are not legal.
	    // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
	    // constants in these cases
	    if (!ST.has16BitInsts())
	      return false;

	    const int16_t Low16 = static_cast<int16_t>(Value);
	    return AMDGPU::isInlinableLiteral16(Low16, ST.hasInv2PiInlineImm());
	  }

	  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
	  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
	    const uint32_t Packed = static_cast<uint32_t>(Value);
	    return AMDGPU::isInlinableLiteralV216(Packed, ST.hasInv2PiInlineImm());
	  }

	  default:
	    llvm_unreachable("invalid bitwidth");
	  }
	}

	bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
	const MCOperandInfo &OpInfo) const {
	switch (MO.getType()) {
	case MachineOperand::MO_Register:
	return false;
	case MachineOperand::MO_Immediate:
	return !isInlineConstant(MO, OpInfo);
	case MachineOperand::MO_FrameIndex:
	case MachineOperand::MO_MachineBasicBlock:
	case MachineOperand::MO_ExternalSymbol:
	case MachineOperand::MO_GlobalAddress:
	case MachineOperand::MO_MCSymbol:
	return true;
	default:
	llvm_unreachable("unexpected operand type");
	}
	}

	static bool compareMachineOp(const MachineOperand &Op0,
	const MachineOperand &Op1) {
	if (Op0.getType() != Op1.getType())
	return false;

	switch (Op0.getType()) {
	case MachineOperand::MO_Register:
	return Op0.getReg() == Op1.getReg();
	case MachineOperand::MO_Immediate:
	return Op0.getImm() == Op1.getImm();
	default:
	llvm_unreachable("Didn't expect to be comparing these operand types");
	}
	}

	bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
	const MachineOperand &MO) const {
	const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];

	assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI());

	if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
	return true;

	if (OpInfo.RegClass < 0)
	return false;

	if (MO.isImm() && isInlineConstant(MO, OpInfo))
	return RI.opCanUseInlineConstant(OpInfo.OperandType);

	return RI.opCanUseLiteralConstant(OpInfo.OperandType);
	}

	// Returns true if Opcode has an e32 counterpart (per getVOPe32) and that
	// counterpart maps to an MC opcode (per pseudoToMCOpcode).
	bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
	  const int E32Opcode = AMDGPU::getVOPe32(Opcode);
	  return E32Opcode != -1 && pseudoToMCOpcode(E32Opcode) != -1;
	}

	// Returns true if Opcode has source modifiers. Every instruction that has
	// modifiers has a src0_modifiers operand, so looking for that one operand is
	// enough.
	bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
	  const int ModIdx =
	      AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0_modifiers);
	  return ModIdx != -1;
	}

	bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
	unsigned OpName) const {
	const MachineOperand *Mods = getNamedOperand(MI, OpName);
	return Mods && Mods->getImm();
	}

	// Returns true if any of the source modifier, clamp or omod operands of MI
	// is present with a non-zero value.
	bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
	  // Checked in this order; the scan stops at the first operand that is set.
	  const unsigned ModifierOperands[] = {
	      AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
	      AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
	      AMDGPU::OpName::omod};

	  for (const unsigned Name : ModifierOperands) {
	    if (hasModifiersSet(MI, Name))
	      return true;
	  }
	  return false;
	}

	// Returns true if operand MO, described by OpInfo, occupies the constant bus.
	bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
	                                  const MachineOperand &MO,
	                                  const MCOperandInfo &OpInfo) const {
	  // Immediates use the constant bus unless they can be encoded inline.
	  if (MO.isImm())
	    return !isInlineConstant(MO, OpInfo);

	  // Misc other operands like FrameIndex.
	  if (!MO.isReg())
	    return true;

	  // Register definitions are not counted; only uses are.
	  if (!MO.isUse())
	    return false;

	  const unsigned Reg = MO.getReg();
	  if (TargetRegisterInfo::isVirtualRegister(Reg))
	    return RI.isSGPRClass(MRI.getRegClass(Reg));

	  // VCC and M0 count whether the use is explicit or implicit.
	  if (Reg == AMDGPU::VCC || Reg == AMDGPU::M0)
	    return true;

	  // Every other physical register only counts as an explicit operand.
	  if (MO.isImplicit())
	    return false;

	  // FLAT_SCR is just an SGPR pair, EXEC uses the constant bus, and so do the
	  // ordinary 32- and 64-bit SGPRs.
	  return Reg == AMDGPU::FLAT_SCR || Reg == AMDGPU::EXEC ||
	         AMDGPU::SGPR_32RegClass.contains(Reg) ||
	         AMDGPU::SGPR_64RegClass.contains(Reg);
	}

	// Returns the first of VCC, M0 or FLAT_SCR that MI reads through an implicit
	// operand, or AMDGPU::NoRegister if it reads none of them.
	static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
	  for (const MachineOperand &ImpOp : MI.implicit_operands()) {
	    // Only reads are of interest; skip definitions.
	    if (!ImpOp.isDef()) {
	      const unsigned Reg = ImpOp.getReg();
	      if (Reg == AMDGPU::VCC || Reg == AMDGPU::M0 ||
	          Reg == AMDGPU::FLAT_SCR)
	        return Reg;
	    }
	  }

	  return AMDGPU::NoRegister;
	}

	// Returns true if MI is expected to carry an implicit read of EXEC. That is
	// every VALU instruction except the V_READLANE/V_WRITELANE variants, plus
	// every other instruction that is not generic, SALU or SMRD.
	static bool shouldReadExec(const MachineInstr &MI) {
	  const unsigned Opc = MI.getOpcode();

	  if (!SIInstrInfo::isVALU(MI)) {
	    return !SIInstrInfo::isGenericOpcode(Opc) &&
	           !SIInstrInfo::isSALU(MI) &&
	           !SIInstrInfo::isSMRD(MI);
	  }

	  switch (Opc) {
	  case AMDGPU::V_READLANE_B32:
	  case AMDGPU::V_READLANE_B32_si:
	  case AMDGPU::V_READLANE_B32_vi:
	  case AMDGPU::V_WRITELANE_B32:
	  case AMDGPU::V_WRITELANE_B32_si:
	  case AMDGPU::V_WRITELANE_B32_vi:
	    return false;
	  default:
	    return true;
	  }
	}

	// Returns true if SubReg names a sub-register of SuperVec. For a physical
	// SubReg this asks TRI; otherwise SubReg must carry a sub-register index and
	// refer to the same register as SuperVec.
	static bool isSubRegOf(const SIRegisterInfo &TRI,
	                       const MachineOperand &SuperVec,
	                       const MachineOperand &SubReg) {
	  const unsigned PartReg = SubReg.getReg();

	  if (TargetRegisterInfo::isPhysicalRegister(PartReg))
	    return TRI.isSubRegister(SuperVec.getReg(), PartReg);

	  if (SubReg.getSubReg() == AMDGPU::NoSubRegister)
	    return false;
	  return PartReg == SuperVec.getReg();
	}

	// Checks target-specific invariants of MI.
	//
	// Returns true when every check passes. On the first violation, ErrInfo is
	// set to a description of the problem and false is returned. Generic opcodes
	// are accepted without further checks, and inline asm is only checked for
	// register class constraints.
	bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
	StringRef &ErrInfo) const {
	// NOTE(review): the opcode is narrowed to uint16_t here - confirm that all
	// opcodes fit.
	uint16_t Opcode = MI.getOpcode();
	if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
	return true;

	const MachineFunction *MF = MI.getParent()->getParent();
	const MachineRegisterInfo &MRI = MF->getRegInfo();

	// Any of these is -1 when the opcode has no such named operand.
	int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
	int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
	int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

	// Make sure the number of operands is correct.
	const MCInstrDesc &Desc = get(Opcode);
	if (!Desc.isVariadic() &&
	Desc.getNumOperands() != MI.getNumExplicitOperands()) {
	ErrInfo = "Instruction has wrong number of operands.";
	return false;
	}

	if (MI.isInlineAsm()) {
	// Verify register classes for inlineasm constraints.
	for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
	I != E; ++I) {
	const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
	if (!RC)
	continue;

	const MachineOperand &Op = MI.getOperand(I);
	if (!Op.isReg())
	continue;

	unsigned Reg = Op.getReg();
	if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
	ErrInfo = "inlineasm operand has incorrect register class.";
	return false;
	}
	}

	return true;
	}

	// Make sure the register classes are correct.
	for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
	if (MI.getOperand(i).isFPImm()) {
	ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
	"all fp values to integers.";
	return false;
	}

	int RegClass = Desc.OpInfo[i].RegClass;

	// Cases that 'break' fall through to the register class check below;
	// 'continue' skips it for this operand.
	switch (Desc.OpInfo[i].OperandType) {
	case MCOI::OPERAND_REGISTER:
	if (MI.getOperand(i).isImm()) {
	ErrInfo = "Illegal immediate value for operand.";
	return false;
	}
	break;
	case AMDGPU::OPERAND_REG_IMM_INT32:
	case AMDGPU::OPERAND_REG_IMM_FP32:
	break;
	case AMDGPU::OPERAND_REG_INLINE_C_INT32:
	case AMDGPU::OPERAND_REG_INLINE_C_FP32:
	case AMDGPU::OPERAND_REG_INLINE_C_INT64:
	case AMDGPU::OPERAND_REG_INLINE_C_FP64:
	case AMDGPU::OPERAND_REG_INLINE_C_INT16:
	case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
	// Anything that is not a register must be an inline-constant immediate.
	const MachineOperand &MO = MI.getOperand(i);
	if (!MO.isReg() && (!MO.isImm() \|\| !isInlineConstant(MI, i))) {
	ErrInfo = "Illegal immediate value for operand.";
	return false;
	}
	break;
	}
	case MCOI::OPERAND_IMMEDIATE:
	case AMDGPU::OPERAND_KIMM32:
	// Check if this operand is an immediate.
	// FrameIndex operands will be replaced by immediates, so they are
	// allowed.
	if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
	ErrInfo = "Expected immediate, but got non-immediate";
	return false;
	}
	LLVM_FALLTHROUGH;
	default:
	continue;
	}

	if (!MI.getOperand(i).isReg())
	continue;

	// Only physical registers are checked against the descriptor's class.
	if (RegClass != -1) {
	unsigned Reg = MI.getOperand(i).getReg();
	if (Reg == AMDGPU::NoRegister \|\|
	TargetRegisterInfo::isVirtualRegister(Reg))
	continue;

	const TargetRegisterClass *RC = RI.getRegClass(RegClass);
	if (!RC->contains(Reg)) {
	ErrInfo = "Operand has incorrect register class.";
	return false;
	}
	}
	}

	// Verify SDWA
	if (isSDWA(MI)) {
	if (!ST.hasSDWA()) {
	ErrInfo = "SDWA is not supported on this target";
	return false;
	}

	int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);

	const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };

	for (int OpIdx: OpIndicies) {
	if (OpIdx == -1)
	continue;
	const MachineOperand &MO = MI.getOperand(OpIdx);

	if (!ST.hasSDWAScalar()) {
	// Only VGPRS on VI
	if (!MO.isReg() \|\| !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
	ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
	return false;
	}
	} else {
	// No immediates on GFX9
	if (!MO.isReg()) {
	ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
	return false;
	}
	}
	}

	if (!ST.hasSDWAOmod()) {
	// No omod allowed on VI
	const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
	if (OMod != nullptr &&
	(!OMod->isImm() \|\| OMod->getImm() != 0)) {
	ErrInfo = "OMod not allowed in SDWA instructions on VI";
	return false;
	}
	}

	uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
	if (isVOPC(BasicOpcode)) {
	if (!ST.hasSDWASdst() && DstIdx != -1) {
	// Only vcc allowed as dst on VI for VOPC
	const MachineOperand &Dst = MI.getOperand(DstIdx);
	if (!Dst.isReg() \|\| Dst.getReg() != AMDGPU::VCC) {
	ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
	return false;
	}
	} else if (!ST.hasSDWAOutModsVOPC()) {
	// NOTE(review): the two messages below say "on VI" although the comments
	// refer to GFX9 - confirm which is intended.
	// No clamp allowed on GFX9 for VOPC
	const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
	if (Clamp && (!Clamp->isImm() \|\| Clamp->getImm() != 0)) {
	ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
	return false;
	}

	// No omod allowed on GFX9 for VOPC
	const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
	if (OMod && (!OMod->isImm() \|\| OMod->getImm() != 0)) {
	ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
	return false;
	}
	}
	}

	// With dst_unused == UNUSED_PRESERVE the destination must be tied to an
	// implicit use of the register being preserved.
	const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
	if (DstUnused && DstUnused->isImm() &&
	DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
	const MachineOperand &Dst = MI.getOperand(DstIdx);
	if (!Dst.isReg() \|\| !Dst.isTied()) {
	ErrInfo = "Dst register should have tied register";
	return false;
	}

	const MachineOperand &TiedMO =
	MI.getOperand(MI.findTiedOperandIdx(DstIdx));
	if (!TiedMO.isReg() \|\| !TiedMO.isImplicit() \|\| !TiedMO.isUse()) {
	ErrInfo =
	"Dst register should be tied to implicit use of preserved register";
	return false;
	} else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
	Dst.getReg() != TiedMO.getReg()) {
	ErrInfo = "Dst register should use same physical register as preserved";
	return false;
	}
	}
	}

	// Verify VOP*
	if (isVOP1(MI) \|\| isVOP2(MI) \|\| isVOP3(MI) \|\| isVOPC(MI) \|\| isSDWA(MI)) {
	// Only look at the true operands. Only a real operand can use the constant
	// bus, and we don't want to check pseudo-operands like the source modifier
	// flags.
	const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

	unsigned ConstantBusCount = 0;

	// A named 'imm' operand counts as one constant bus use.
	if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
	++ConstantBusCount;

	// So does an implicit read of VCC, M0 or FLAT_SCR.
	unsigned SGPRUsed = findImplicitSGPRRead(MI);
	if (SGPRUsed != AMDGPU::NoRegister)
	++ConstantBusCount;

	// A register operand is only counted when it differs from the register
	// most recently recorded in SGPRUsed.
	for (int OpIdx : OpIndices) {
	if (OpIdx == -1)
	break;
	const MachineOperand &MO = MI.getOperand(OpIdx);
	if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
	if (MO.isReg()) {
	if (MO.getReg() != SGPRUsed)
	++ConstantBusCount;
	SGPRUsed = MO.getReg();
	} else {
	++ConstantBusCount;
	}
	}
	}
	if (ConstantBusCount > 1) {
	ErrInfo = "VOP* instruction uses the constant bus more than once";
	return false;
	}
	}

	// Verify misc. restrictions on specific instructions.
	if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 \|\|
	Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
	const MachineOperand &Src0 = MI.getOperand(Src0Idx);
	const MachineOperand &Src1 = MI.getOperand(Src1Idx);
	const MachineOperand &Src2 = MI.getOperand(Src2Idx);
	if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
	if (!compareMachineOp(Src0, Src1) &&
	!compareMachineOp(Src0, Src2)) {
	ErrInfo = "v_div_scale_{f32\|f64} require src0 = src1 or src2";
	return false;
	}
	}
	}

	// SOPK immediates must fit in 16 bits, unsigned or signed depending on
	// sopkIsZext.
	if (isSOPK(MI)) {
	int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
	if (sopkIsZext(MI)) {
	if (!isUInt<16>(Imm)) {
	ErrInfo = "invalid immediate for SOPK instruction";
	return false;
	}
	} else {
	if (!isInt<16>(Imm)) {
	ErrInfo = "invalid immediate for SOPK instruction";
	return false;
	}
	}
	}

	if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 \|\|
	Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 \|\|
	Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 \|\|
	Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
	const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 \|\|
	Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;

	// The movrel-specific implicit operands are expected right after the
	// descriptor's explicit operands and static implicit uses.
	const unsigned StaticNumOps = Desc.getNumOperands() +
	Desc.getNumImplicitUses();
	const unsigned NumImplicitOps = IsDst ? 2 : 1;

	// Allow additional implicit operands. This allows a fixup done by the post
	// RA scheduler where the main implicit operand is killed and implicit-defs
	// are added for sub-registers that remain live after this instruction.
	if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
	ErrInfo = "missing implicit register operands";
	return false;
	}

	const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
	if (IsDst) {
	if (!Dst->isUse()) {
	ErrInfo = "v_movreld_b32 vdst should be a use operand";
	return false;
	}

	unsigned UseOpIdx;
	if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) \|\|
	UseOpIdx != StaticNumOps + 1) {
	ErrInfo = "movrel implicit operands should be tied";
	return false;
	}
	}

	const MachineOperand &Src0 = MI.getOperand(Src0Idx);
	const MachineOperand &ImpUse
	= MI.getOperand(StaticNumOps + NumImplicitOps - 1);
	if (!ImpUse.isReg() \|\| !ImpUse.isUse() \|\|
	!isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
	ErrInfo = "src0 should be subreg of implicit vector use";
	return false;
	}
	}

	// Make sure we aren't losing exec uses in the td files. This mostly requires
	// being careful when using let Uses to try to add other use registers.
	if (shouldReadExec(MI)) {
	if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
	ErrInfo = "VALU instruction does not implicitly read exec mask";
	return false;
	}
	}

	if (isSMRD(MI)) {
	if (MI.mayStore()) {
	// The register offset form of scalar stores may only use m0 as the
	// soffset register.
	const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
	if (Soff && Soff->getReg() != AMDGPU::M0) {
	ErrInfo = "scalar stores must use m0 as offset register";
	return false;
	}
	}
	}

	// NOTE(review): Offset is dereferenced without a null check; this assumes
	// every FLAT instruction has an 'offset' operand - confirm.
	if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
	const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
	if (Offset->getImm() != 0) {
	ErrInfo = "subtarget does not support offsets in flat instructions";
	return false;
	}
	}

	return true;
	}

	// Maps the opcode of MI to the opcode listed for it in the table below
	// (judging by the function name, the VALU replacement for a scalar
	// instruction). Returns AMDGPU::INSTRUCTION_LIST_END when the opcode has no
	// entry.
	//
	// REG_SEQUENCE, COPY, PHI, INSERT_SUBREG, WQM and WWM map to themselves.
	// Two entries depend on more than the opcode: S_MOV_B32 becomes a COPY when
	// operand 1 is a register, and S_ADD_I32 / S_SUB_I32 use the no-carry forms
	// when the subtarget has them.
	unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
	switch (MI.getOpcode()) {
	default: return AMDGPU::INSTRUCTION_LIST_END;
	case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
	case AMDGPU::COPY: return AMDGPU::COPY;
	case AMDGPU::PHI: return AMDGPU::PHI;
	case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
	case AMDGPU::WQM: return AMDGPU::WQM;
	case AMDGPU::WWM: return AMDGPU::WWM;
	case AMDGPU::S_MOV_B32:
	return MI.getOperand(1).isReg() ?
	AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
	case AMDGPU::S_ADD_I32:
	return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
	case AMDGPU::S_ADDC_U32:
	return AMDGPU::V_ADDC_U32_e32;
	case AMDGPU::S_SUB_I32:
	return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
	// FIXME: These are not consistently handled, and selected when the carry is
	// used.
	case AMDGPU::S_ADD_U32:
	return AMDGPU::V_ADD_I32_e32;
	case AMDGPU::S_SUB_U32:
	return AMDGPU::V_SUB_I32_e32;
	case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
	case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
	case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
	case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
	case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
	case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
	case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
	case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
	case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
	case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
	case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
	case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
	case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
	case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
	case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
	// Both sign-extension forms map to the same bitfield-extract opcode.
	case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
	case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
	case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
	case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
	case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
	case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
	case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
	// NOTE(review): the 64-bit NOT maps to a 32-bit opcode; presumably the
	// caller splits it into two halves - confirm.
	case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
	case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
	case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
	case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
	case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
	case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
	case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
	case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
	case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
	case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
	case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
	case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
	case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
	case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
	case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
	case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
	case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
	case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
	case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
	// SCC-based branches map to the corresponding VCC-based branch opcodes.
	case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
	case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
	}
	}

	// Returns the register class of operand OpNo of MI. The class comes from the
	// instruction descriptor when it specifies one; for variadic instructions,
	// operands beyond the descriptor, or operands with no class in the
	// descriptor, it is derived from the register currently in the operand.
	const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
	                                                      unsigned OpNo) const {
	  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
	  const MCInstrDesc &Desc = get(MI.getOpcode());

	  const bool HasStaticClass = !MI.isVariadic() &&
	                              OpNo < Desc.getNumOperands() &&
	                              Desc.OpInfo[OpNo].RegClass != -1;
	  if (HasStaticClass) {
	    const unsigned RCID = Desc.OpInfo[OpNo].RegClass;
	    return RI.getRegClass(RCID);
	  }

	  // No static class: look at the actual register.
	  const unsigned Reg = MI.getOperand(OpNo).getReg();
	  if (!TargetRegisterInfo::isVirtualRegister(Reg))
	    return RI.getPhysRegClass(Reg);
	  return MRI.getRegClass(Reg);
	}

	// Returns true if the register class of operand OpNo of MI has VGPRs. For
	// COPY, REG_SEQUENCE, PHI and INSERT_SUBREG the class of operand 0 is used
	// instead of the class of OpNo.
	bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
	  unsigned CheckedOp = OpNo;

	  switch (MI.getOpcode()) {
	  case AMDGPU::COPY:
	  case AMDGPU::REG_SEQUENCE:
	  case AMDGPU::PHI:
	  case AMDGPU::INSERT_SUBREG:
	    CheckedOp = 0;
	    break;
	  default:
	    break;
	  }

	  return RI.hasVGPRs(getOpRegClass(MI, CheckedOp));
	}

	// Legalizes operand OpIdx of MI by moving its current value into a fresh
	// virtual VGPR with an instruction built at MI's position, then rewriting the
	// operand to use that register.
	void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
	  MachineBasicBlock::iterator InsertPt = MI;
	  MachineBasicBlock *MBB = MI.getParent();
	  MachineOperand &MO = MI.getOperand(OpIdx);
	  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

	  const unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
	  const TargetRegisterClass *RC = RI.getRegClass(RCID);

	  // Registers are copied; other operands are materialized with a scalar or
	  // vector move depending on the operand's register class.
	  unsigned MoveOpc;
	  if (MO.isReg())
	    MoveOpc = AMDGPU::COPY;
	  else
	    MoveOpc = RI.isSGPRClass(RC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

	  // The new register is a 64-bit VGPR when the equivalent VGPR class shares
	  // a sub-class with VReg_64, and a 32-bit VGPR otherwise.
	  const TargetRegisterClass *EquivVRC = RI.getEquivalentVGPRClass(RC);
	  const TargetRegisterClass *NewRC =
	      RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, EquivVRC)
	          ? &AMDGPU::VReg_64RegClass
	          : &AMDGPU::VGPR_32RegClass;

	  const unsigned NewReg = MRI.createVirtualRegister(NewRC);
	  DebugLoc DL = MBB->findDebugLoc(InsertPt);
	  BuildMI(*MBB, InsertPt, DL, get(MoveOpc), NewReg).add(MO);
	  MO.ChangeToRegister(NewReg, false);
	}

	// Emits COPYs at MI's position that extract sub-register SubIdx of SuperReg
	// into a new virtual register of class SubRC, and returns that register.
	unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
	                                         MachineRegisterInfo &MRI,
	                                         MachineOperand &SuperReg,
	                                         const TargetRegisterClass *SuperRC,
	                                         unsigned SubIdx,
	                                         const TargetRegisterClass *SubRC)
	                                         const {
	  MachineBasicBlock *MBB = MI->getParent();
	  DebugLoc DL = MI->getDebugLoc();
	  const unsigned ResultReg = MRI.createVirtualRegister(SubRC);

	  unsigned SourceReg = SuperReg.getReg();
	  if (SuperReg.getSubReg() != AMDGPU::NoSubRegister) {
	    // The super register is itself a sub-register. Copy it to a new value
	    // first so its subreg index never has to be merged with SubIdx. The
	    // register coalescer should be able to eliminate this extra copy.
	    const unsigned WholeReg = MRI.createVirtualRegister(SuperRC);
	    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), WholeReg)
	        .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
	    SourceReg = WholeReg;
	  }

	  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), ResultReg)
	      .addReg(SourceReg, 0, SubIdx);
	  return ResultReg;
	}

	// Returns an operand holding the SubIdx part of Op. For an immediate this is
	// the low (sub0) or high (sub1) 32 bits as a new immediate; any other SubIdx
	// is unhandled. For anything else the part is extracted with
	// buildExtractSubReg and returned as a register use.
	MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
	    MachineBasicBlock::iterator MII,
	    MachineRegisterInfo &MRI,
	    MachineOperand &Op,
	    const TargetRegisterClass *SuperRC,
	    unsigned SubIdx,
	    const TargetRegisterClass *SubRC) const {
	  if (!Op.isImm()) {
	    const unsigned PartReg =
	        buildExtractSubReg(MII, MRI, Op, SuperRC, SubIdx, SubRC);
	    return MachineOperand::CreateReg(PartReg, false);
	  }

	  const int64_t Value = Op.getImm();
	  if (SubIdx == AMDGPU::sub0)
	    return MachineOperand::CreateImm(static_cast<int32_t>(Value));
	  if (SubIdx == AMDGPU::sub1)
	    return MachineOperand::CreateImm(static_cast<int32_t>(Value >> 32));

	  llvm_unreachable("Unhandled register index for immediate");
	}

	// Reorders the three explicit operands of Inst from (0, 1, 2) to (0, 2, 1) by
	// removing operand 1 and adding a copy of it back.
	void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
	  assert(Inst.getNumExplicitOperands() == 3);

	  // Take a copy first: the operand slot is gone once it has been removed.
	  const MachineOperand Moved = Inst.getOperand(1);
	  Inst.RemoveOperand(1);
	  Inst.addOperand(Moved);
	}

	bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
	const MCOperandInfo &OpInfo,
	const MachineOperand &MO) const {
	if (!MO.isReg())
	return false;

	unsigned Reg = MO.getReg();
	const TargetRegisterClass *RC =
	TargetRegisterInfo::isVirtualRegister(Reg) ?
	MRI.getRegClass(Reg) :
	RI.getPhysRegClass(Reg);

	const SIRegisterInfo *TRI =
	static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
	RC = TRI->getSubRegClass(RC, MO.getSubReg());

	// In order to be legal, the common sub-class must be equal to the
	// class of the current operand. For example:
	//
	// v_mov_b32 s0 ; Operand defined as vsrc_b32
	// ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
	//
	// s_sendmsg 0, s0 ; Operand defined as m0reg
	// ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

	return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
	}

	// Returns true if MO is legal for the operand described by OpInfo. Registers
	// are checked with isLegalRegOperand; immediate-like operands (immediate,
	// target index, frame index) are always accepted.
	bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
	                                     const MCOperandInfo &OpInfo,
	                                     const MachineOperand &MO) const {
	  if (!MO.isReg()) {
	    // Handle non-register types that are treated like immediates.
	    assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
	    return true;
	  }

	  return isLegalRegOperand(MRI, OpInfo, MO);
	}

	bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
	const MachineOperand *MO) const {
	const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
	const MCInstrDesc &InstDesc = MI.getDesc();
	const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
	const TargetRegisterClass *DefinedRC =
	OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
	if (!MO)
	MO = &MI.getOperand(OpIdx);

	if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {

	RegSubRegPair SGPRUsed;
	if (MO->isReg())
	SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());

	for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
	if (i == OpIdx)
	continue;
	const MachineOperand &Op = MI.getOperand(i);
	if (Op.isReg()) {
	if ((Op.getReg() != SGPRUsed.Reg \|\| Op.getSubReg() != SGPRUsed.SubReg) &&
	usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
	return false;
	}
	} else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
	return false;
	}
	}
	}

	if (MO->isReg()) {
	assert(DefinedRC);
	return isLegalRegOperand(MRI, OpInfo, *MO);
	}

	// Handle non-register types that are treated like immediates.
	assert(MO->isImm() \|\| MO->isTargetIndex() \|\| MO->isFI());

	if (!DefinedRC) {
	// This operand expects an immediate.
	return true;
	}

	return isImmOperandLegal(MI, OpIdx, *MO);
	}

	// Legalizes the src0/src1 operands of the VOP2 instruction MI in place.
	//
	// If MI implicitly reads VCC, M0 or FLAT_SCR, an SGPR src0 is first moved
	// into a VGPR. If src1 is then still not a legal register operand, it is
	// fixed by one of: a V_READFIRSTLANE_B32 for V_READLANE_B32's lane select,
	// commuting src0 and src1 (switching MI to the commuted opcode), or moving
	// src1 into a new register with legalizeOpWithMove.
	void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
	MachineInstr &MI) const {
	unsigned Opc = MI.getOpcode();
	const MCInstrDesc &InstrDesc = get(Opc);

	int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
	MachineOperand &Src1 = MI.getOperand(Src1Idx);

	// If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
	// we need to only have one constant bus use.
	//
	// Note we do not need to worry about literal constants here. They are
	// disabled for the operand type for instructions because they will always
	// violate the one constant bus use rule.
	bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
	if (HasImplicitSGPR) {
	int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
	MachineOperand &Src0 = MI.getOperand(Src0Idx);

	if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
	legalizeOpWithMove(MI, Src0Idx);
	}

	// VOP2 src0 instructions support all operand types, so we don't need to check
	// their legality. If src1 is already legal, we don't need to do anything.
	if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
	return;

	// Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
	// lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
	// select is uniform.
	if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
	RI.isVGPR(MRI, Src1.getReg())) {
	unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
	const DebugLoc &DL = MI.getDebugLoc();
	BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
	.add(Src1);
	Src1.ChangeToRegister(Reg, false);
	return;
	}

	// We do not use commuteInstruction here because it is too aggressive and will
	// commute if it is possible. We only want to commute here if it improves
	// legality. This can be called a fairly large number of times so don't waste
	// compile time pointlessly swapping and checking legality again.
	if (HasImplicitSGPR \|\| !MI.isCommutable()) {
	legalizeOpWithMove(MI, Src1Idx);
	return;
	}

	int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
	MachineOperand &Src0 = MI.getOperand(Src0Idx);

	// If src0 can be used as src1, commuting will make the operands legal.
	// Otherwise we have to give up and insert a move.
	//
	// TODO: Other immediate-like operand kinds could be commuted if there was a
	// MachineOperand::ChangeTo* for them.
	if ((!Src1.isImm() && !Src1.isReg()) \|\|
	!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
	legalizeOpWithMove(MI, Src1Idx);
	return;
	}

	// No commuted form of this opcode: fall back to a move.
	int CommutedOpc = commuteOpcode(MI);
	if (CommutedOpc == -1) {
	legalizeOpWithMove(MI, Src1Idx);
	return;
	}

	MI.setDesc(get(CommutedOpc));

	// Save src0's register, sub-register and kill flag before it is overwritten
	// with src1's value below.
	unsigned Src0Reg = Src0.getReg();
	unsigned Src0SubReg = Src0.getSubReg();
	bool Src0Kill = Src0.isKill();

	if (Src1.isImm())
	Src0.ChangeToImmediate(Src1.getImm());
	else if (Src1.isReg()) {
	Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
	Src0.setSubReg(Src1.getSubReg());
	} else
	llvm_unreachable("Should only have register or immediate operands");

	// src1 receives the saved src0 register, completing the swap.
	Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
	Src1.setSubReg(Src0SubReg);
	}

	// Legalize VOP3 operands. Every operand slot accepts every operand type, and
	// literal constants are not allowed and should never be seen, so the only
	// thing to fix is the use of more than one SGPR: a single SGPR is kept and
	// any other SGPR source operand is moved with legalizeOpWithMove.
	void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
	                                       MachineInstr &MI) const {
	  const unsigned Opc = MI.getOpcode();

	  int SrcIdx[3] = {
	    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
	    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
	    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
	  };

	  // Find the one SGPR operand we are allowed to use.
	  unsigned AllowedSGPR = findUsedSGPR(MI, SrcIdx);

	  for (const int Idx : SrcIdx) {
	    // Sources are contiguous; the first missing one ends the scan.
	    if (Idx == -1)
	      break;

	    MachineOperand &Src = MI.getOperand(Idx);

	    // We should never see a VOP3 instruction with an illegal immediate
	    // operand, so non-registers need no work.
	    if (!Src.isReg())
	      continue;

	    // VGPRs are legal.
	    if (!RI.isSGPRClass(MRI.getRegClass(Src.getReg())))
	      continue;

	    // A second, different SGPR is not legal and must be legalized.
	    if (AllowedSGPR != AMDGPU::NoRegister && AllowedSGPR != Src.getReg()) {
	      legalizeOpWithMove(MI, Idx);
	      continue;
	    }

	    // We can use one SGPR in each VOP3 instruction.
	    AllowedSGPR = Src.getReg();
	  }
	}

	unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
	MachineRegisterInfo &MRI) const {
	const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
	const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
	unsigned DstReg = MRI.createVirtualRegister(SRC);
	unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

	SmallVector<unsigned, 8> SRegs;
	for (unsigned i = 0; i < SubRegs; ++i) {
	unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
	BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
	get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
	.addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
	SRegs.push_back(SGPR);
	}

	MachineInstrBuilder MIB =
	BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
	get(AMDGPU::REG_SEQUENCE), DstReg);
	for (unsigned i = 0; i < SubRegs; ++i) {
	MIB.addReg(SRegs[i]);
	MIB.addImm(RI.getSubRegFromChannel(i));
	}
	return DstReg;
	}

	// Legalize the sbase operand of an SMRD instruction.
	//
	// If the pointer is stored in VGPRs it is moved to SGPRs using
	// v_readfirstlane. This is safe because only loads with uniform pointers are
	// selected to SMRD instructions, so the pointer value is known to be
	// uniform. Nothing happens when MI has no sbase operand or when sbase is
	// already in an SGPR class.
	void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
	                                       MachineInstr &MI) const {
	  MachineOperand *Base = getNamedOperand(MI, AMDGPU::OpName::sbase);
	  if (!Base)
	    return;

	  if (RI.isSGPRClass(MRI.getRegClass(Base->getReg())))
	    return;

	  unsigned ScalarBase = readlaneVGPRToSGPR(Base->getReg(), MI, MRI);
	  Base->setReg(ScalarBase);
	}

	// Make Op use a register of class DstRC by inserting a COPY when needed.
	//
	// If the class of Op (taking its sub-register index into account) is already
	// DstRC nothing is done. Otherwise a COPY into a new virtual register of
	// class DstRC is inserted before I in InsertMBB, and Op is rewritten to use
	// that register with no sub-register index. When the original register is
	// defined by a move-immediate, the immediate is folded into the new COPY.
	//
	// \param InsertMBB block that receives the COPY.
	// \param I         insertion point inside InsertMBB.
	// \param DstRC     register class the operand must end up in.
	// \param Op        operand to legalize; modified in place.
	// \param MRI       register info for the enclosing function.
	// \param DL        debug location given to the COPY.
	void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
	                                         MachineBasicBlock::iterator I,
	                                         const TargetRegisterClass *DstRC,
	                                         MachineOperand &Op,
	                                         MachineRegisterInfo &MRI,
	                                         const DebugLoc &DL) const {
	  unsigned OpReg = Op.getReg();
	  unsigned OpSubReg = Op.getSubReg();

	  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
	      RI.getRegClassForReg(MRI, OpReg), OpSubReg);

	  // Check if operand is already the correct register class.
	  if (DstRC == OpRC)
	    return;

	  // The COPY takes the operand in its original form, so it has to be built
	  // before Op is redirected to the new register below.
	  unsigned DstReg = MRI.createVirtualRegister(DstRC);
	  MachineInstr *Copy =
	      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);

	  Op.setReg(DstReg);
	  Op.setSubReg(0);

	  MachineInstr *Def = MRI.getVRegDef(OpReg);
	  if (!Def)
	    return;

	  // Try to eliminate the copy if it is copying an immediate value.
	  // FoldImmediate takes the use and def instructions by reference.
	  if (Def->isMoveImmediate())
	    FoldImmediate(*Copy, *Def, OpReg, &MRI);
	}

	// Legalize the operands of MI, dispatching on the kind of instruction.
	//
	//  - VOP2/VOPC, VOP3 and SMRD instructions are handed to their dedicated
	//    helpers.
	//  - PHI, REG_SEQUENCE and INSERT_SUBREG get copies inserted so that their
	//    operand register classes agree with what the instruction needs.
	//  - MIMG (and MUBUF/MTBUF in shaders) have srsrc / ssamp moved to SGPRs
	//    with readlaneVGPRToSGPR().
	//  - Any remaining instruction with an srsrc operand is treated as MUBUF
	//    and, when srsrc is not in a compatible class, rewritten to use an
	//    empty resource descriptor plus a 64-bit vaddr (addr64 form).
	//
	// In the _OFFSET -> ADDR64 conversion MI is removed from its parent block
	// and replaced by a newly built instruction.
	void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
	  MachineFunction &MF = *MI.getParent()->getParent();
	  MachineRegisterInfo &MRI = MF.getRegInfo();

	  // Legalize VOP2
	  if (isVOP2(MI) || isVOPC(MI)) {
	    legalizeOperandsVOP2(MRI, MI);
	    return;
	  }

	  // Legalize VOP3
	  if (isVOP3(MI)) {
	    legalizeOperandsVOP3(MRI, MI);
	    return;
	  }

	  // Legalize SMRD
	  if (isSMRD(MI)) {
	    legalizeOperandsSMRD(MRI, MI);
	    return;
	  }

	  // Legalize REG_SEQUENCE and PHI
	  // The register class of the operands must be the same type as the register
	  // class of the output.
	  if (MI.getOpcode() == AMDGPU::PHI) {
	    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
	    // PHI operands come in (value, predecessor block) pairs starting at 1.
	    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
	      if (!MI.getOperand(i).isReg() ||
	          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
	        continue;
	      const TargetRegisterClass *OpRC =
	          MRI.getRegClass(MI.getOperand(i).getReg());
	      if (RI.hasVGPRs(OpRC)) {
	        VRC = OpRC;
	      } else {
	        SRC = OpRC;
	      }
	    }

	    // If any of the operands are VGPR registers, then they all must be,
	    // otherwise we will create illegal VGPR->SGPR copies when legalizing
	    // them.
	    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
	      if (!VRC) {
	        assert(SRC);
	        VRC = RI.getEquivalentVGPRClass(SRC);
	      }
	      RC = VRC;
	    } else {
	      RC = SRC;
	    }

	    // Update all the operands so they have the same type.
	    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
	      MachineOperand &Op = MI.getOperand(I);
	      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
	        continue;

	      // MI is a PHI instruction, so the copy goes into the predecessor
	      // block named by the following operand, before its terminators.
	      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
	      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

	      // Avoid creating no-op copies with the same src and dst reg class. These
	      // confuse some of the machine passes.
	      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
	    }
	  }

	  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
	  // VGPR dest type and SGPR sources, insert copies so all operands are
	  // VGPRs. This seems to help operand folding / the register coalescer.
	  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
	    MachineBasicBlock *MBB = MI.getParent();
	    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
	    if (RI.hasVGPRs(DstRC)) {
	      // Update all the operands so they are VGPR register classes. These may
	      // not be the same register class because REG_SEQUENCE supports mixing
	      // subregister index types e.g. sub0_sub1 + sub2 + sub3
	      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
	        MachineOperand &Op = MI.getOperand(I);
	        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
	          continue;

	        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
	        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
	        if (VRC == OpRC)
	          continue;

	        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
	        Op.setIsKill();
	      }
	    }

	    return;
	  }

	  // Legalize INSERT_SUBREG
	  // src0 must have the same register class as dst
	  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
	    unsigned Dst = MI.getOperand(0).getReg();
	    unsigned Src0 = MI.getOperand(1).getReg();
	    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
	    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
	    if (DstRC != Src0RC) {
	      MachineBasicBlock *MBB = MI.getParent();
	      MachineOperand &Op = MI.getOperand(1);
	      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
	    }
	    return;
	  }

	  // Legalize MIMG and MUBUF/MTBUF for shaders.
	  //
	  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
	  // scratch memory access. In both cases, the legalization never involves
	  // conversion to the addr64 form.
	  if (isMIMG(MI) ||
	      (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
	       (isMUBUF(MI) || isMTBUF(MI)))) {
	    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
	    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
	      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
	      SRsrc->setReg(SGPR);
	    }

	    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
	    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
	      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
	      SSamp->setReg(SGPR);
	    }
	    return;
	  }

	  // Legalize MUBUF* instructions by converting to addr64 form.
	  // FIXME: If we start using the non-addr64 instructions for compute, we
	  // may need to legalize them as above. This especially applies to the
	  // buffer_load_format_* variants and variants with idxen (or bothen).
	  int SRsrcIdx =
	      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
	  if (SRsrcIdx != -1) {
	    // We have an MUBUF instruction
	    MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
	    unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
	    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
	                             RI.getRegClass(SRsrcRC))) {
	      // The operands are legal.
	      // FIXME: We may need to legalize operands besides srsrc.
	      return;
	    }

	    MachineBasicBlock &MBB = *MI.getParent();

	    // Extract the ptr from the resource descriptor.
	    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
	      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

	    // Create an empty resource descriptor
	    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
	    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
	    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
	    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

	    // Zero64 = 0
	    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
	        .addImm(0);

	    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
	    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
	        .addImm(RsrcDataFormat & 0xFFFFFFFF);

	    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
	    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
	        .addImm(RsrcDataFormat >> 32);

	    // NewSRsrc = {Zero64, SRsrcFormat}
	    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
	        .addReg(Zero64)
	        .addImm(AMDGPU::sub0_sub1)
	        .addReg(SRsrcFormatLo)
	        .addImm(AMDGPU::sub2)
	        .addReg(SRsrcFormatHi)
	        .addImm(AMDGPU::sub3);

	    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
	    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
	    if (VAddr) {
	      // This is already an ADDR64 instruction so we need to add the pointer
	      // extracted from the resource descriptor to the current value of VAddr.
	      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

	      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
	      DebugLoc DL = MI.getDebugLoc();
	      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
	        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
	        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

	      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
	      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
	        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
	        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

	      // NewVaddr = {NewVaddrHi, NewVaddrLo}
	      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
	          .addReg(NewVAddrLo)
	          .addImm(AMDGPU::sub0)
	          .addReg(NewVAddrHi)
	          .addImm(AMDGPU::sub1);
	    } else {
	      // This instruction is the _OFFSET variant, so we need to convert it to
	      // ADDR64.
	      assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
	             < SISubtarget::VOLCANIC_ISLANDS &&
	             "FIXME: Need to emit flat atomics here");

	      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
	      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
	      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
	      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

	      // Atomics with return have an additional tied operand and are
	      // missing some of the special bits.
	      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
	      MachineInstr *Addr64;

	      if (!VDataIn) {
	        // Regular buffer load / store.
	        MachineInstrBuilder MIB =
	            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
	                .add(*VData)
	                .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
	                // This will be replaced later
	                // with the new value of vaddr.
	                .add(*SRsrc)
	                .add(*SOffset)
	                .add(*Offset);

	        // Atomics do not have this operand.
	        if (const MachineOperand *GLC =
	                getNamedOperand(MI, AMDGPU::OpName::glc)) {
	          MIB.addImm(GLC->getImm());
	        }

	        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));

	        if (const MachineOperand *TFE =
	                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
	          MIB.addImm(TFE->getImm());
	        }

	        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
	        Addr64 = MIB;
	      } else {
	        // Atomics with return.
	        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
	                     .add(*VData)
	                     .add(*VDataIn)
	                     .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
	                     // This will be replaced later
	                     // with the new value of vaddr.
	                     .add(*SRsrc)
	                     .add(*SOffset)
	                     .add(*Offset)
	                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
	                     .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
	      }

	      MI.removeFromParent();

	      // NewVaddr = {NewVaddrHi, NewVaddrLo}
	      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
	              NewVAddr)
	          .addReg(SRsrcPtr, 0, AMDGPU::sub0)
	          .addImm(AMDGPU::sub0)
	          .addReg(SRsrcPtr, 0, AMDGPU::sub1)
	          .addImm(AMDGPU::sub1);

	      // From here on VAddr / SRsrc refer to the operands of the replacement
	      // instruction, not of the removed MI.
	      VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
	      SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
	    }

	    // Update the instruction to use NewVaddr
	    VAddr->setReg(NewVAddr);
	    // Update the instruction to use NewSRsrc
	    SRsrc->setReg(NewSRsrc);
	  }
	}

	// Move TopInst to the VALU, together with every instruction that gets
	// queued on the worklist while doing so.
	//
	// Each instruction popped from the worklist is either handled by one of the
	// special cases in the switch below, or retargeted in place to the opcode
	// returned by getVALUOp(): SCC operands are removed, extra immediates are
	// appended for the SEXT / BCNT1 / BFE forms, the destination is replaced by
	// a new virtual register of the class from getDestEquivalentVGPRClass(),
	// the operands are legalized, and the users of the new destination are
	// queued. If the new opcode is INSTRUCTION_LIST_END the instruction is left
	// in place and only its operands are legalized.
	//
	// NOTE(review): the S_BUFFER_LOAD_DWORD_SGPR case below still carries both
	// sides of a diff hunk (lines prefixed with '-' and '+'); confirm which side
	// is intended before treating this text as compilable source.
	void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
	SetVectorType Worklist;
	Worklist.insert(&TopInst);

	while (!Worklist.empty()) {
	MachineInstr &Inst = *Worklist.pop_back_val();
	MachineBasicBlock *MBB = Inst.getParent();
	MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

	unsigned Opcode = Inst.getOpcode();
	unsigned NewOpcode = getVALUOp(Inst);

	// Handle some special cases
	switch (Opcode) {
	default:
	break;
	case AMDGPU::S_ADD_U64_PSEUDO:
	case AMDGPU::S_SUB_U64_PSEUDO:
	splitScalar64BitAddSub(Worklist, Inst);
	Inst.eraseFromParent();
	continue;
	case AMDGPU::S_ADD_I32:
	case AMDGPU::S_SUB_I32:
	// FIXME: The u32 versions currently selected use the carry.
	if (moveScalarAddSub(Worklist, Inst))
	continue;

	// Default handling
	break;
	case AMDGPU::S_AND_B64:
	splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_OR_B64:
	splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_XOR_B64:
	splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_NOT_B64:
	splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_BCNT1_I32_B64:
	splitScalar64BitBCNT(Worklist, Inst);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_BFE_I64:
	splitScalar64BitBFE(Worklist, Inst);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_LSHL_B32:
	if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
	NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
	swapOperands(Inst);
	}
	break;
	case AMDGPU::S_ASHR_I32:
	if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
	NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
	swapOperands(Inst);
	}
	break;
	case AMDGPU::S_LSHR_B32:
	if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
	NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
	swapOperands(Inst);
	}
	break;
	case AMDGPU::S_LSHL_B64:
	if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
	NewOpcode = AMDGPU::V_LSHLREV_B64;
	swapOperands(Inst);
	}
	break;
	case AMDGPU::S_ASHR_I64:
	if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
	NewOpcode = AMDGPU::V_ASHRREV_I64;
	swapOperands(Inst);
	}
	break;
	case AMDGPU::S_LSHR_B64:
	if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
	NewOpcode = AMDGPU::V_LSHRREV_B64;
	swapOperands(Inst);
	}
	break;

	case AMDGPU::S_ABS_I32:
	lowerScalarAbs(Worklist, Inst);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_CBRANCH_SCC0:
	case AMDGPU::S_CBRANCH_SCC1:
	// Clear unused bits of vcc
	BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
	AMDGPU::VCC)
	.addReg(AMDGPU::EXEC)
	.addReg(AMDGPU::VCC);
	break;

	case AMDGPU::S_BFE_U64:
	case AMDGPU::S_BFM_B64:
	llvm_unreachable("Moving this op to VALU not implemented");

	case AMDGPU::S_PACK_LL_B32_B16:
	case AMDGPU::S_PACK_LH_B32_B16:
	case AMDGPU::S_PACK_HH_B32_B16:
	movePackToVALU(Worklist, MRI, Inst);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_XNOR_B32:
	lowerScalarXnor(Worklist, Inst);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_XNOR_B64:
	splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
	Inst.eraseFromParent();
	continue;

	case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
	unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
	auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
	unsigned Offset = 0;

	// FIXME: This isn't safe because the addressing mode doesn't work
	// correctly if vaddr is negative.
	//
	- // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate
	- // being in src0.
	- //
	// FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
	//
	// See if we can extract an immediate offset by recognizing one of these:
	// V_ADD_I32_e32 dst, imm, src1
	// V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
	// V_ADD will be removed by "Remove dead machine instructions".
	- if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) {
	- const MachineOperand *Src =
	- getNamedOperand(*Add, AMDGPU::OpName::src0);
	+ if (Add &&
	+ (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 \|\|
	+ Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
	+ static const unsigned SrcNames[2] = {
	+ AMDGPU::OpName::src0,
	+ AMDGPU::OpName::src1,
	+ };

	- if (Src->isReg()) {
	- auto Mov = MRI.getUniqueVRegDef(Src->getReg());
	- if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
	- Src = &Mov->getOperand(1);
	- }
	+ // Find a literal offset in one of source operands.
	+ for (int i = 0; i < 2; i++) {
	+ const MachineOperand *Src =
	+ getNamedOperand(*Add, SrcNames[i]);

	- if (Src) {
	- if (Src->isImm())
	- Offset = Src->getImm();
	- else if (Src->isCImm())
	- Offset = Src->getCImm()->getZExtValue();
	- }
	+ if (Src->isReg()) {
	+ auto Mov = MRI.getUniqueVRegDef(Src->getReg());
	+ if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
	+ Src = &Mov->getOperand(1);
	+ }

	- if (Offset && isLegalMUBUFImmOffset(Offset))
	- VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1);
	- else
	+ if (Src) {
	+ if (Src->isImm())
	+ Offset = Src->getImm();
	+ else if (Src->isCImm())
	+ Offset = Src->getCImm()->getZExtValue();
	+ }
	+
	+ if (Offset && isLegalMUBUFImmOffset(Offset)) {
	+ VAddr = getNamedOperand(*Add, SrcNames[!i]);
	+ break;
	+ }
	+
	Offset = 0;
	+ }
	}

	BuildMI(*MBB, Inst, Inst.getDebugLoc(),
	get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
	.add(*VAddr) // vaddr
	.add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
	.addImm(0) // soffset
	.addImm(Offset) // offset
	.addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
	.addImm(0) // slc
	.addImm(0) // tfe
	.setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end());

	MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
	VDst);
	addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
	Inst.eraseFromParent();
	continue;
	}
	}

	if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
	// We cannot move this instruction to the VALU, so we should try to
	// legalize its operands instead.
	legalizeOperands(Inst);
	continue;
	}

	// Use the new VALU Opcode.
	const MCInstrDesc &NewDesc = get(NewOpcode);
	Inst.setDesc(NewDesc);

	// Remove any references to SCC. Vector instructions can't read from it, and
	// We're just about to add the implicit use / defs of VCC, and we don't want
	// both.
	for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
	MachineOperand &Op = Inst.getOperand(i);
	if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
	Inst.RemoveOperand(i);
	addSCCDefUsersToVALUWorklist(Inst, Worklist);
	}
	}

	if (Opcode == AMDGPU::S_SEXT_I32_I8 \|\| Opcode == AMDGPU::S_SEXT_I32_I16) {
	// We are converting these to a BFE, so we need to add the missing
	// operands for the size and offset.
	unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
	Inst.addOperand(MachineOperand::CreateImm(0));
	Inst.addOperand(MachineOperand::CreateImm(Size));

	} else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
	// The VALU version adds the second operand to the result, so insert an
	// extra 0 operand.
	Inst.addOperand(MachineOperand::CreateImm(0));
	}

	Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());

	if (Opcode == AMDGPU::S_BFE_I32 \|\| Opcode == AMDGPU::S_BFE_U32) {
	const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
	// If we need to move this to VGPRs, we need to unpack the second operand
	// back into the 2 separate ones for bit offset and width.
	assert(OffsetWidthOp.isImm() &&
	"Scalar BFE is only implemented for constant width and offset");
	uint32_t Imm = OffsetWidthOp.getImm();

	uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
	uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
	Inst.RemoveOperand(2); // Remove old immediate.
	Inst.addOperand(MachineOperand::CreateImm(Offset));
	Inst.addOperand(MachineOperand::CreateImm(BitWidth));
	}

	bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
	unsigned NewDstReg = AMDGPU::NoRegister;
	if (HasDst) {
	unsigned DstReg = Inst.getOperand(0).getReg();
	if (TargetRegisterInfo::isPhysicalRegister(DstReg))
	continue;

	// Update the destination register class.
	const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
	if (!NewDstRC)
	continue;

	if (Inst.isCopy() &&
	TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
	NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
	// Instead of creating a copy where src and dst are the same register
	// class, we just replace all uses of dst with src. These kinds of
	// copies interfere with the heuristics MachineSink uses to decide
	// whether or not to split a critical edge. Since the pass assumes
	// that copies will end up as machine instructions and not be
	// eliminated.
	addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
	MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
	MRI.clearKillFlags(Inst.getOperand(1).getReg());
	Inst.getOperand(0).setReg(DstReg);
	continue;
	}

	NewDstReg = MRI.createVirtualRegister(NewDstRC);
	MRI.replaceRegWith(DstReg, NewDstReg);
	}

	// Legalize the operands
	legalizeOperands(Inst);

	if (HasDst)
	addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
	}
	}

	// Add/sub require special handling to deal with carry outs.
	//
	// When the subtarget has a carry-less add (ST.hasAddNoCarry()), Inst
	// (S_ADD_I32 or S_SUB_I32) is rewritten in place to V_ADD_U32_e64 or
	// V_SUB_U32_e64: its SCC operand is removed, its old destination is replaced
	// everywhere by a new VGPR, its operands are legalized and the users of the
	// new result are added to Worklist.
	//
	// \returns true if Inst was converted here, false if the caller has to fall
	// back to the default handling.
	bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
	                                   MachineInstr &Inst) const {
	  if (!ST.hasAddNoCarry())
	    return false;

	  // Assume there is no user of scc since we don't select this in that case.
	  // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
	  // is used.

	  MachineBasicBlock &MBB = *Inst.getParent();
	  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

	  unsigned OldDstReg = Inst.getOperand(0).getReg();
	  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

	  unsigned Opc = Inst.getOpcode();
	  assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);

	  unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
	    AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;

	  // Operand 3 is expected to be SCC; it is removed before the descriptor
	  // is switched to the vector opcode.
	  assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
	  Inst.RemoveOperand(3);

	  Inst.setDesc(get(NewOpc));
	  Inst.addImplicitDefUseOperands(*MBB.getParent());
	  MRI.replaceRegWith(OldDstReg, ResultReg);
	  legalizeOperands(Inst);

	  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
	  return true;
	}

	// Expand a scalar abs in front of Inst as max(src, 0 - src) on the VALU.
	//
	// The subtraction uses V_SUB_U32_e32 when the subtarget has a carry-less add
	// and V_SUB_I32_e32 otherwise. All uses of Inst's destination are redirected
	// to the new result register, whose users are then queued on Worklist. Inst
	// itself is not erased here.
	void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
	                                 MachineInstr &Inst) const {
	  MachineBasicBlock &Block = *Inst.getParent();
	  MachineRegisterInfo &MRI = Block.getParent()->getRegInfo();
	  MachineBasicBlock::iterator InsertPt = Inst;
	  DebugLoc Loc = Inst.getDebugLoc();

	  MachineOperand &DstOp = Inst.getOperand(0);
	  MachineOperand &SrcOp = Inst.getOperand(1);
	  unsigned NegReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	  unsigned AbsReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

	  unsigned SubOpc = AMDGPU::V_SUB_I32_e32;
	  if (ST.hasAddNoCarry())
	    SubOpc = AMDGPU::V_SUB_U32_e32;

	  // NegReg = 0 - src
	  MachineInstrBuilder Neg = BuildMI(Block, InsertPt, Loc, get(SubOpc), NegReg);
	  Neg.addImm(0);
	  Neg.addReg(SrcOp.getReg());

	  // AbsReg = max(src, NegReg)
	  MachineInstrBuilder Max =
	      BuildMI(Block, InsertPt, Loc, get(AMDGPU::V_MAX_I32_e64), AbsReg);
	  Max.addReg(SrcOp.getReg());
	  Max.addReg(NegReg);

	  MRI.replaceRegWith(DstOp.getReg(), AbsReg);
	  addUsersToMoveToVALUWorklist(AbsReg, MRI, Worklist);
	}

	// Expand a scalar xnor in front of Inst as V_NOT_B32(V_XOR_B32(src0, src1)).
	//
	// Both sources are first legalized to VGPR_32 with legalizeGenericOperand().
	// All uses of Inst's destination are redirected to the result of the NOT,
	// whose users are then queued on Worklist. Inst itself is not erased here.
	void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
	                                  MachineInstr &Inst) const {
	  MachineBasicBlock &Block = *Inst.getParent();
	  MachineRegisterInfo &MRI = Block.getParent()->getRegInfo();
	  MachineBasicBlock::iterator InsertPt = Inst;
	  const DebugLoc &Loc = Inst.getDebugLoc();

	  MachineOperand &DstOp = Inst.getOperand(0);
	  MachineOperand &LHS = Inst.getOperand(1);
	  MachineOperand &RHS = Inst.getOperand(2);

	  const TargetRegisterClass *VecRC = &AMDGPU::VGPR_32RegClass;
	  legalizeGenericOperand(Block, InsertPt, VecRC, LHS, MRI, Loc);
	  legalizeGenericOperand(Block, InsertPt, VecRC, RHS, MRI, Loc);

	  // XorReg = LHS ^ RHS
	  unsigned XorReg = MRI.createVirtualRegister(VecRC);
	  MachineInstrBuilder XorMI =
	      BuildMI(Block, InsertPt, Loc, get(AMDGPU::V_XOR_B32_e64), XorReg);
	  XorMI.add(LHS);
	  XorMI.add(RHS);

	  // NotReg = ~XorReg
	  unsigned NotReg = MRI.createVirtualRegister(VecRC);
	  MachineInstrBuilder NotMI =
	      BuildMI(Block, InsertPt, Loc, get(AMDGPU::V_NOT_B32_e64), NotReg);
	  NotMI.addReg(XorReg);

	  MRI.replaceRegWith(DstOp.getReg(), NotReg);
	  addUsersToMoveToVALUWorklist(NotReg, MRI, Worklist);
	}

	// Split a 64-bit scalar unary operation into two instructions with opcode
	// Opcode, one on the sub0 half and one on the sub1 half of the source.
	//
	// The two results are joined with a REG_SEQUENCE into a register of the VGPR
	// class equivalent to the original destination class; all uses of the old
	// destination are redirected to it and its users are queued on Worklist.
	// Inst itself is not erased here.
	void SIInstrInfo::splitScalar64BitUnaryOp(
	    SetVectorType &Worklist, MachineInstr &Inst,
	    unsigned Opcode) const {
	  MachineBasicBlock &Block = *Inst.getParent();
	  MachineRegisterInfo &MRI = Block.getParent()->getRegInfo();
	  MachineBasicBlock::iterator InsertPt = Inst;
	  DebugLoc Loc = Inst.getDebugLoc();

	  MachineOperand &DstOp = Inst.getOperand(0);
	  MachineOperand &SrcOp = Inst.getOperand(1);

	  const MCInstrDesc &HalfDesc = get(Opcode);

	  // A non-register source is treated as if it were a 32-bit SGPR.
	  const TargetRegisterClass *SrcRC = &AMDGPU::SGPR_32RegClass;
	  if (SrcOp.isReg())
	    SrcRC = MRI.getRegClass(SrcOp.getReg());
	  const TargetRegisterClass *SrcHalfRC =
	      RI.getSubRegClass(SrcRC, AMDGPU::sub0);

	  const TargetRegisterClass *WideDstRC =
	      RI.getEquivalentVGPRClass(MRI.getRegClass(DstOp.getReg()));
	  const TargetRegisterClass *HalfDstRC =
	      RI.getSubRegClass(WideDstRC, AMDGPU::sub0);

	  // Emit the operation for one 32-bit half and return its result register.
	  auto EmitHalf = [&](unsigned SubIdx) -> unsigned {
	    MachineOperand HalfSrc = buildExtractSubRegOrImm(InsertPt, MRI, SrcOp,
	                                                     SrcRC, SubIdx, SrcHalfRC);
	    unsigned HalfDst = MRI.createVirtualRegister(HalfDstRC);
	    BuildMI(Block, InsertPt, Loc, HalfDesc, HalfDst).add(HalfSrc);
	    return HalfDst;
	  };

	  unsigned LoReg = EmitHalf(AMDGPU::sub0);
	  unsigned HiReg = EmitHalf(AMDGPU::sub1);

	  unsigned WideReg = MRI.createVirtualRegister(WideDstRC);
	  BuildMI(Block, InsertPt, Loc, get(TargetOpcode::REG_SEQUENCE), WideReg)
	      .addReg(LoReg)
	      .addImm(AMDGPU::sub0)
	      .addReg(HiReg)
	      .addImm(AMDGPU::sub1);

	  MRI.replaceRegWith(DstOp.getReg(), WideReg);

	  // No legalizeOperands call is needed: with a single operand, src0 accepts
	  // any kind of input.

	  // Everything that consumed the old value has to move as well.
	  addUsersToMoveToVALUWorklist(WideReg, MRI, Worklist);
	}

	// Split S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO into a pair of 32-bit VALU
	// instructions connected by a carry register.
	//
	// The low half uses V_ADD_I32_e64 / V_SUB_I32_e64 and defines the carry; the
	// high half uses V_ADDC_U32_e64 / V_SUBB_U32_e64, consumes that carry and
	// marks its own carry-out dead. The halves are joined with a REG_SEQUENCE
	// into a VReg_64 that replaces every use of the old destination, and the
	// users of that register are queued on Worklist. Inst itself is not erased
	// here.
	void SIInstrInfo::splitScalar64BitAddSub(
	    SetVectorType &Worklist, MachineInstr &Inst) const {
	  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

	  MachineBasicBlock &MBB = *Inst.getParent();
	  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

	  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
	  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

	  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

	  MachineOperand &Dest = Inst.getOperand(0);
	  MachineOperand &Src0 = Inst.getOperand(1);
	  MachineOperand &Src1 = Inst.getOperand(2);
	  const DebugLoc &DL = Inst.getDebugLoc();
	  MachineBasicBlock::iterator MII = Inst;

	  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
	  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
	  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
	  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

	  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
	                                                       AMDGPU::sub0, Src0SubRC);
	  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
	                                                       AMDGPU::sub0, Src1SubRC);


	  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
	                                                       AMDGPU::sub1, Src0SubRC);
	  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
	                                                       AMDGPU::sub1, Src1SubRC);

	  // Low half: produces the carry consumed by the high half.
	  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
	  MachineInstr *LoHalf =
	    BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
	    .addReg(CarryReg, RegState::Define)
	    .add(SrcReg0Sub0)
	    .add(SrcReg1Sub0);

	  // High half: its own carry-out is never read, so it is defined dead.
	  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
	  MachineInstr *HiHalf =
	    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
	    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
	    .add(SrcReg0Sub1)
	    .add(SrcReg1Sub1)
	    .addReg(CarryReg, RegState::Kill);

	  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
	    .addReg(DestSub0)
	    .addImm(AMDGPU::sub0)
	    .addReg(DestSub1)
	    .addImm(AMDGPU::sub1);

	  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

	  // Try to legalize the operands in case we need to swap the order to keep it
	  // valid.
	  legalizeOperands(*LoHalf);
	  legalizeOperands(*HiHalf);

	  // Move all users of this moved value.
	  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
	}

	void SIInstrInfo::splitScalar64BitBinaryOp(
	SetVectorType &Worklist, MachineInstr &Inst,
	unsigned Opcode) const {
	MachineBasicBlock &MBB = *Inst.getParent();
	MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

	MachineOperand &Dest = Inst.getOperand(0);
	MachineOperand &Src0 = Inst.getOperand(1);
	MachineOperand &Src1 = Inst.getOperand(2);
	DebugLoc DL = Inst.getDebugLoc();

	MachineBasicBlock::iterator MII = Inst;

	const MCInstrDesc &InstDesc = get(Opcode);
	const TargetRegisterClass *Src0RC = Src0.isReg() ?
	MRI.getRegClass(Src0.getReg()) :
	&AMDGPU::SGPR_32RegClass;

	const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
	const TargetRegisterClass *Src1RC = Src1.isReg() ?
	MRI.getRegClass(Src1.getReg()) :
	&AMDGPU::SGPR_32RegClass;

	const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

	MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
	AMDGPU::sub0, Src0SubRC);
	MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
	AMDGPU::sub0, Src1SubRC);

	const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
	const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
	const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

	unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
	MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
	.add(SrcReg0Sub0)
	.add(SrcReg1Sub0);

	MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
	AMDGPU::sub1, Src0SubRC);
	MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
	AMDGPU::sub1, Src1SubRC);

	unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
	MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
	.add(SrcReg0Sub1)
	.add(SrcReg1Sub1);

	unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
	BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
	.addReg(DestSub0)
	.addImm(AMDGPU::sub0)
	.addReg(DestSub1)
	.addImm(AMDGPU::sub1);

	MRI.replaceRegWith(Dest.getReg(), FullDestReg);

	// Try to legalize the operands in case we need to swap the order to keep it
	// valid.
	legalizeOperands(LoHalf);
	legalizeOperands(HiHalf);

	// Move all users of this moved value.
	addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
	}

	// Replace the 64-bit scalar bit-count instruction \p Inst with two
	// V_BCNT_U32_B32_e64 instructions inserted before it: the first takes the
	// sub0 half of the source and an immediate 0, the second takes the sub1 half
	// and the first instruction's result. All uses of the original destination
	// are redirected to the new 32-bit VGPR result, and users that cannot read a
	// VGPR are queued on \p Worklist.
	void SIInstrInfo::splitScalar64BitBCNT(
	SetVectorType &Worklist, MachineInstr &Inst) const {
	MachineBasicBlock &MBB = *Inst.getParent();
	MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

	MachineBasicBlock::iterator MII = Inst;
	DebugLoc DL = Inst.getDebugLoc();

	MachineOperand &Dest = Inst.getOperand(0);
	MachineOperand &Src = Inst.getOperand(1);

	const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
	// A non-register source is treated as if it had the 32-bit SGPR class.
	const TargetRegisterClass *SrcRC = Src.isReg() ?
	MRI.getRegClass(Src.getReg()) :
	&AMDGPU::SGPR_32RegClass;

	unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

	const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

	MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
	AMDGPU::sub0, SrcSubRC);
	MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
	AMDGPU::sub1, SrcSubRC);

	// First half: sub0 of the source with an immediate 0 as second input.
	BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

	// Second half: sub1 of the source with the first result as second input.
	BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

	MRI.replaceRegWith(Dest.getReg(), ResultReg);

	// We don't need to legalize operands here. src0 for either instruction can be
	// an SGPR, and the second input is unused or determined here.
	addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
	}

	// Replace the 64-bit scalar bit-field extract \p Inst with VALU code inserted
	// before it. Only the sign-extend-in-register form is supported (opcode
	// S_BFE_I64, offset 0, width <= 32), as enforced by the assert below. The
	// result is a new VReg_64 whose sub0 holds the (sign-extended) low half and
	// whose sub1 holds the low half arithmetically shifted right by 31. Users
	// that cannot read a VGPR are queued on \p Worklist.
	void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
	MachineInstr &Inst) const {
	MachineBasicBlock &MBB = *Inst.getParent();
	MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	MachineBasicBlock::iterator MII = Inst;
	DebugLoc DL = Inst.getDebugLoc();

	MachineOperand &Dest = Inst.getOperand(0);
	// Operand 2 packs the field offset and width into one immediate.
	uint32_t Imm = Inst.getOperand(2).getImm();
	uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
	uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

	// Offset is only read by the assert; silence unused-variable warnings in
	// builds where asserts are compiled out.
	(void) Offset;

	// Only sext_inreg cases handled.
	assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
	Offset == 0 && "Not implemented");

	if (BitWidth < 32) {
	// Narrow field: extract it from sub0 with V_BFE_I32, then build the high
	// half from that result with an arithmetic shift right by 31.
	unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

	BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
	.addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
	.addImm(0)
	.addImm(BitWidth);

	BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
	.addImm(31)
	.addReg(MidRegLo);

	BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
	.addReg(MidRegLo)
	.addImm(AMDGPU::sub0)
	.addReg(MidRegHi)
	.addImm(AMDGPU::sub1);

	MRI.replaceRegWith(Dest.getReg(), ResultReg);
	addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
	return;
	}

	// BitWidth == 32: the low half is reused unchanged as sub0; only the high
	// half (sub0 shifted right arithmetically by 31) has to be computed.
	MachineOperand &Src = Inst.getOperand(1);
	unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

	BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
	.addImm(31)
	.addReg(Src.getReg(), 0, AMDGPU::sub0);

	BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
	.addReg(Src.getReg(), 0, AMDGPU::sub0)
	.addImm(AMDGPU::sub0)
	.addReg(TmpReg)
	.addImm(AMDGPU::sub1);

	MRI.replaceRegWith(Dest.getReg(), ResultReg);
	addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
	}

	void SIInstrInfo::addUsersToMoveToVALUWorklist(
	unsigned DstReg,
	MachineRegisterInfo &MRI,
	SetVectorType &Worklist) const {
	for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
	E = MRI.use_end(); I != E;) {
	MachineInstr &UseMI = *I->getParent();
	if (!canReadVGPR(UseMI, I.getOperandNo())) {
	Worklist.insert(&UseMI);

	do {
	++I;
	} while (I != E && I->getParent() == &UseMI);
	} else {
	++I;
	}
	}
	}

	// Replace one of the S_PACK_{LL,LH,HH}_B32_B16 instructions with a VALU
	// sequence inserted before \p Inst that defines a fresh 32-bit VGPR. All uses
	// of the original destination are redirected to that register, and users that
	// cannot read a VGPR are queued on \p Worklist. Any other opcode is a
	// programming error (llvm_unreachable).
	void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
	MachineRegisterInfo &MRI,
	MachineInstr &Inst) const {
	unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	MachineBasicBlock *MBB = Inst.getParent();
	MachineOperand &Src0 = Inst.getOperand(1);
	MachineOperand &Src1 = Inst.getOperand(2);
	const DebugLoc &DL = Inst.getDebugLoc();

	switch (Inst.getOpcode()) {
	case AMDGPU::S_PACK_LL_B32_B16: {
	// Mask src0 with 0xffff, then combine with src1 and the shift amount 16
	// via V_LSHL_OR_B32.
	unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

	// FIXME: Can do a lot better if we know the high bits of src0 or src1 are
	// 0.
	BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
	.addImm(0xffff);

	BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
	.addReg(ImmReg, RegState::Kill)
	.add(Src0);

	BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
	.add(Src1)
	.addImm(16)
	.addReg(TmpReg, RegState::Kill);
	break;
	}
	case AMDGPU::S_PACK_LH_B32_B16: {
	// A single V_BFI_B32 with the 0xffff mask as its first operand.
	unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
	.addImm(0xffff);
	BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
	.addReg(ImmReg, RegState::Kill)
	.add(Src0)
	.add(Src1);
	break;
	}
	case AMDGPU::S_PACK_HH_B32_B16: {
	// Shift src0 right by 16, then combine src1, the 0xffff0000 mask and the
	// shifted value via V_AND_OR_B32.
	unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
	.addImm(16)
	.add(Src0);
	BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
	.addImm(0xffff0000);
	BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
	.add(Src1)
	.addReg(ImmReg, RegState::Kill)
	.addReg(TmpReg, RegState::Kill);
	break;
	}
	default:
	llvm_unreachable("unhandled s_pack_* instruction");
	}

	MachineOperand &Dest = Inst.getOperand(0);
	MRI.replaceRegWith(Dest.getReg(), ResultReg);
	addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
	}

	// Walk forward from \p SCCDefInst (inclusive) to the end of its block and
	// queue every instruction that reads SCC on \p Worklist. The walk stops at
	// the first instruction that defines SCC.
	void SIInstrInfo::addSCCDefUsersToVALUWorklist(
	    MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
	  // This assumes that all the users of SCC are in the same block
	  // as the SCC def.
	  MachineBasicBlock::iterator It(SCCDefInst);
	  const MachineBasicBlock::iterator BlockEnd = SCCDefInst.getParent()->end();

	  for (; It != BlockEnd; ++It) {
	    MachineInstr &Cur = *It;

	    // Another SCC def ends the range of the def we started from.
	    const bool DefinesSCC = Cur.findRegisterDefOperandIdx(AMDGPU::SCC) != -1;
	    if (DefinesSCC)
	      break;

	    const bool ReadsSCC = Cur.findRegisterUseOperandIdx(AMDGPU::SCC) != -1;
	    if (ReadsSCC)
	      Worklist.insert(&Cur);
	  }
	}

	// Return the register class the destination (operand 0) of \p Inst should
	// have on the VALU. For the generic opcodes listed below this is the VGPR
	// class equivalent to the current one, or nullptr if the current class
	// already has VGPRs or no equivalent exists. For every other opcode the
	// result of getOpRegClass() is returned unchanged.
	const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
	    const MachineInstr &Inst) const {
	  const TargetRegisterClass *DstRC = getOpRegClass(Inst, 0);

	  switch (Inst.getOpcode()) {
	  // For target instructions, getOpRegClass just returns the virtual register
	  // class associated with the operand, so we need to find an equivalent VGPR
	  // register class in order to move the instruction to the VALU.
	  case AMDGPU::COPY:
	  case AMDGPU::PHI:
	  case AMDGPU::REG_SEQUENCE:
	  case AMDGPU::INSERT_SUBREG:
	  case AMDGPU::WQM:
	  case AMDGPU::WWM:
	    // getEquivalentVGPRClass() may itself yield nullptr, which is passed
	    // straight through to the caller.
	    return RI.hasVGPRs(DstRC) ? nullptr : RI.getEquivalentVGPRClass(DstRC);
	  default:
	    return DstRC;
	  }
	}

	// Find the one SGPR operand we are allowed to use.
	//
	// \p OpIndices holds up to three operand indices of \p MI to inspect; an
	// entry of -1 ends the list early. Returns the selected SGPR, or
	// AMDGPU::NoRegister when nothing forces or favours a particular one.
	unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
	                                   int OpIndices[3]) const {
	  const MCInstrDesc &Desc = MI.getDesc();

	  // First we need to consider the instruction's operand requirements before
	  // legalizing. Some operands are required to be SGPRs, such as implicit uses
	  // of VCC, but we are still bound by the constant bus requirement to only use
	  // one.
	  //
	  // If the operand's class is an SGPR, we can never move it.

	  unsigned SGPRReg = findImplicitSGPRRead(MI);
	  if (SGPRReg != AMDGPU::NoRegister)
	    return SGPRReg;

	  // Initialise every slot explicitly rather than relying on the remaining
	  // elements being zero-filled and NoRegister happening to be 0.
	  unsigned UsedSGPRs[3] = {AMDGPU::NoRegister, AMDGPU::NoRegister,
	                           AMDGPU::NoRegister};
	  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

	  for (unsigned i = 0; i < 3; ++i) {
	    int Idx = OpIndices[i];
	    if (Idx == -1)
	      break;

	    const MachineOperand &MO = MI.getOperand(Idx);
	    if (!MO.isReg())
	      continue;

	    // Is this operand statically required to be an SGPR based on the operand
	    // constraints?
	    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
	    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
	    if (IsRequiredSGPR)
	      return MO.getReg();

	    // If this could be a VGPR or an SGPR, check the dynamic register class.
	    unsigned Reg = MO.getReg();
	    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
	    if (RI.isSGPRClass(RegRC))
	      UsedSGPRs[i] = Reg;
	  }

	  // We don't have a required SGPR operand, so we have a bit more freedom in
	  // selecting operands to move.

	  // Try to select the most used SGPR. If an SGPR is equal to one of the
	  // others, we choose that.
	  //
	  // e.g.
	  // V_FMA_F32 v0, s0, s0, s0 -> No moves
	  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

	  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
	  // prefer those.

	  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
	    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
	      SGPRReg = UsedSGPRs[0];
	  }

	  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
	    if (UsedSGPRs[1] == UsedSGPRs[2])
	      SGPRReg = UsedSGPRs[1];
	  }

	  return SGPRReg;
	}

	// Look up the operand of \p MI identified by \p OperandName. Returns a
	// pointer to that operand, or nullptr when the opcode has no operand with
	// this name (the index lookup yields -1).
	MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
	                                             unsigned OperandName) const {
	  const int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
	  return OpIdx == -1 ? nullptr : &MI.getOperand(OpIdx);
	}

	// Return the default buffer-resource data-format word: the base
	// AMDGPU::RSRC_DATA_FORMAT value, plus generation-specific bits when the
	// subtarget targets the AMD HSA OS.
	uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
	  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
	  if (ST.isAmdHsaOS()) {
	    // Set ATC = 1. GFX9 doesn't have this bit.
	    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
	      RsrcDataFormat |= (1ULL << 56);

	    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
	    // BTW, it disables TC L2 and therefore decreases performance.
	    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
	      RsrcDataFormat |= (2ULL << 59);
	  }

	  return RsrcDataFormat;
	}

	// Build the 64-bit value for words 2 and 3 of the scratch buffer resource:
	// the default data format with TID_ENABLE and a size field of 0xffffffff,
	// then the element size (up to VOLCANIC_ISLANDS) and the index stride.
	uint64_t SIInstrInfo::getScratchRsrcWords23() const {
	  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
	                    AMDGPU::RSRC_TID_ENABLE |
	                    0xffffffff; // Size;

	  // GFX9 doesn't have ELEMENT_SIZE.
	  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
	    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
	    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
	  }

	  // IndexStride = 64.
	  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

	  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
	  // Clear them unless we want a huge stride.
	  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
	    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

	  return Rsrc23;
	}

	// An instruction counts as low latency exactly when its opcode is an SMRD
	// opcode.
	bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
	  return isSMRD(MI.getOpcode());
	}

	// An instruction counts as high latency when its opcode is a MUBUF, MTBUF or
	// MIMG opcode.
	bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
	  unsigned Opc = MI.getOpcode();

	  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
	}

	// If the `vaddr` operand of \p MI is a frame index, store that index in
	// \p FrameIndex and return the register of the `vdata` operand. Otherwise
	// return AMDGPU::NoRegister and leave \p FrameIndex untouched.
	unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
	                                    int &FrameIndex) const {
	  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
	  if (!Addr || !Addr->isFI())
	    return AMDGPU::NoRegister;

	  // A frame-index access is expected to carry a memory operand in the
	  // private address space.
	  assert(!MI.memoperands_empty() &&
	         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);

	  FrameIndex = Addr->getIndex();
	  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
	}

	unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
	int &FrameIndex) const {
	const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
	assert(Addr && Addr->isFI());
	FrameIndex = Addr->getIndex();
	return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
	}

	// If \p MI may load and is a MUBUF, VGPR-spill or SGPR-spill instruction,
	// delegate to the matching stack-access helper, which sets \p FrameIndex and
	// returns the data register. Returns AMDGPU::NoRegister otherwise.
	unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
	                                          int &FrameIndex) const {
	  if (!MI.mayLoad())
	    return AMDGPU::NoRegister;

	  if (isMUBUF(MI) || isVGPRSpill(MI))
	    return isStackAccess(MI, FrameIndex);

	  if (isSGPRSpill(MI))
	    return isSGPRStackAccess(MI, FrameIndex);

	  return AMDGPU::NoRegister;
	}

	// If \p MI may store and is a MUBUF, VGPR-spill or SGPR-spill instruction,
	// delegate to the matching stack-access helper, which sets \p FrameIndex and
	// returns the data register. Returns AMDGPU::NoRegister otherwise.
	unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
	                                         int &FrameIndex) const {
	  if (!MI.mayStore())
	    return AMDGPU::NoRegister;

	  if (isMUBUF(MI) || isVGPRSpill(MI))
	    return isStackAccess(MI, FrameIndex);

	  if (isSGPRSpill(MI))
	    return isSGPRStackAccess(MI, FrameIndex);

	  return AMDGPU::NoRegister;
	}

	unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
	unsigned Size = 0;
	MachineBasicBlock::const_instr_iterator I = MI.getIterator();
	MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
	while (++I != E && I->isInsideBundle()) {
	assert(!I->isBundle() && "No nested bundle!");
	Size += getInstSizeInBytes(*I);
	}

	return Size;
	}

	// Return the encoded size of \p MI in bytes. A descriptor size other than 0
	// or 4 is returned directly. VALU/SALU instructions are 4 bytes, or 8 when
	// src0 or src1 is literal-constant-like. A small set of generic opcodes is
	// handled explicitly; anything else is unreachable.
	unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
	  unsigned Opc = MI.getOpcode();
	  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
	  unsigned DescSize = Desc.getSize();

	  // If we have a definitive size, we can use it. Otherwise we need to inspect
	  // the operands to know the size.
	  //
	  // FIXME: Instructions that have a base 32-bit encoding report their size as
	  // 4, even though they are really 8 bytes if they have a literal operand.
	  if (DescSize != 0 && DescSize != 4)
	    return DescSize;

	  // 4-byte instructions may have a 32-bit literal encoded after them. Check
	  // operands that could ever be literals.
	  if (isVALU(MI) || isSALU(MI)) {
	    if (isFixedSize(MI))
	      return DescSize;

	    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
	    if (Src0Idx == -1)
	      return 4; // No operands.

	    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
	      return 8;

	    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
	    if (Src1Idx == -1)
	      return 4;

	    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
	      return 8;

	    return 4;
	  }

	  if (DescSize == 4)
	    return 4;

	  // DescSize is 0 from here on: only these generic opcodes are expected.
	  switch (Opc) {
	  case TargetOpcode::IMPLICIT_DEF:
	  case TargetOpcode::KILL:
	  case TargetOpcode::DBG_VALUE:
	  case TargetOpcode::EH_LABEL:
	    return 0;
	  case TargetOpcode::BUNDLE:
	    return getInstBundleSize(MI);
	  case TargetOpcode::INLINEASM: {
	    const MachineFunction *MF = MI.getParent()->getParent();
	    const char *AsmStr = MI.getOperand(0).getSymbolName();
	    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
	  }
	  default:
	    llvm_unreachable("unable to find instruction size");
	  }
	}

	// Report whether \p MI may touch the flat address space. Non-FLAT
	// instructions never do. A FLAT instruction without memory operands is
	// assumed to; otherwise at least one memory operand must be in
	// FLAT_ADDRESS.
	bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
	  if (!isFLAT(MI))
	    return false;

	  // With no memory operands the loop below does not run and the answer
	  // stays true.
	  bool MayBeFlat = MI.memoperands_empty();
	  for (const MachineMemOperand *MemOp : MI.memoperands()) {
	    if (MemOp->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS) {
	      MayBeFlat = true;
	      break;
	    }
	  }
	  return MayBeFlat;
	}

	// True exactly when \p Branch is the SI_NON_UNIFORM_BRCOND_PSEUDO opcode.
	bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
	  const unsigned Opc = Branch.getOpcode();
	  return Opc == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
	}

	// If the first terminator of \p IfEntry is SI_NON_UNIFORM_BRCOND_PSEUDO,
	// replace it with an SI_IF (appended to \p IfEntry) that defines a fresh
	// SReg_64 register, and insert a matching SI_END_CF at the first non-PHI
	// position of \p IfEnd. Any other terminator leaves both blocks unchanged.
	void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
	                                            MachineBasicBlock *IfEnd) const {
	  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
	  assert(TI != IfEntry->end());

	  MachineInstr *Branch = &(*TI);
	  MachineFunction *MF = IfEntry->getParent();
	  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();

	  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
	    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	    // Both replacements are built detached from any block, copying what they
	    // need from Branch, so the old branch can be erased before insertion.
	    MachineInstr *SIIF =
	        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
	            .add(Branch->getOperand(0))
	            .add(Branch->getOperand(1));
	    MachineInstr *SIEND =
	        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
	            .addReg(DstReg);

	    IfEntry->erase(TI);
	    IfEntry->insert(IfEntry->end(), SIIF);
	    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
	  }
	}

	void SIInstrInfo::convertNonUniformLoopRegion(
	MachineBasicBlock LoopEntry, MachineBasicBlock LoopEnd) const {
	MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
	// We expect 2 terminators, one conditional and one unconditional.
	assert(TI != LoopEnd->end());

	MachineInstr Branch = &(TI);
	MachineFunction *MF = LoopEnd->getParent();
	MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();

	if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {

	unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	MachineInstrBuilder HeaderPHIBuilder =
	BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
	for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
	E = LoopEntry->pred_end();
	PI != E; ++PI) {
	if (*PI == LoopEnd) {
	HeaderPHIBuilder.addReg(BackEdgeReg);
	} else {
	MachineBasicBlock PMBB = PI;
	unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
	ZeroReg, 0);
	HeaderPHIBuilder.addReg(ZeroReg);
	}
	HeaderPHIBuilder.addMBB(*PI);
	}
	MachineInstr *HeaderPhi = HeaderPHIBuilder;
	MachineInstr SIIFBREAK = BuildMI((MF), Branch->getDebugLoc(),
	get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
	.addReg(DstReg)
	.add(Branch->getOperand(0));
	MachineInstr *SILOOP =
	BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
	.addReg(BackEdgeReg)
	.addMBB(LoopEntry);

	LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
	LoopEnd->erase(TI);
	LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
	LoopEnd->insert(LoopEnd->end(), SILOOP);
	}
	}

	// Return the table that pairs each AMDGPU target index with its textual
	// name. The table has static storage, so the returned ArrayRef stays valid.
	ArrayRef<std::pair<int, const char *>>
	SIInstrInfo::getSerializableTargetIndices() const {
	  using IndexName = std::pair<int, const char *>;
	  static const IndexName Table[] = {
	      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
	      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
	      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
	      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
	      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
	  return makeArrayRef(Table);
	}

	/// Hazard recognizer factory for the post-RA scheduler
	/// (SchedulePostRAList.cpp); the post-RA version of misched uses
	/// CreateTargetMIHazardRecognizer instead. Returns a newly allocated
	/// GCNHazardRecognizer for the DAG's machine function; \p II is not used.
	ScheduleHazardRecognizer *
	SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
	                                                const ScheduleDAG *DAG) const {
	  const MachineFunction &MF = DAG->MF;
	  return new GCNHazardRecognizer(MF);
	}

	/// Hazard recognizer factory used at -O0 by the PostRAHazardRecognizer
	/// pass. Returns a newly allocated GCNHazardRecognizer for \p MF.
	ScheduleHazardRecognizer *
	SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
	  ScheduleHazardRecognizer *Recognizer = new GCNHazardRecognizer(MF);
	  return Recognizer;
	}

	// Split the operand target flags \p TF into the part selected by MO_MASK
	// (first) and the remaining bits (second).
	std::pair<unsigned, unsigned>
	SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
	  const unsigned MaskedPart = TF & MO_MASK;
	  const unsigned RemainingPart = TF & ~MO_MASK;
	  return std::make_pair(MaskedPart, RemainingPart);
	}

	// Return the table that pairs each direct machine-operand target flag with
	// its textual name. The table has static storage, so the returned ArrayRef
	// stays valid.
	ArrayRef<std::pair<unsigned, const char *>>
	SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
	  using FlagName = std::pair<unsigned, const char *>;
	  static const FlagName Table[] = {
	      {MO_GOTPCREL, "amdgpu-gotprel"},
	      {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
	      {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
	      {MO_REL32_LO, "amdgpu-rel32-lo"},
	      {MO_REL32_HI, "amdgpu-rel32-hi"}};

	  return makeArrayRef(Table);
	}

	// An instruction belongs to the block prologue when it is neither a
	// terminator nor a COPY and it modifies EXEC.
	bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
	  if (MI.isTerminator())
	    return false;
	  if (MI.getOpcode() == AMDGPU::COPY)
	    return false;
	  return MI.modifiesRegister(AMDGPU::EXEC, &RI);
	}

	// Start building an add into \p DestReg, inserted before \p I in \p MBB,
	// whose carry-out is not needed. Subtargets with a carry-less add get
	// V_ADD_U32_e64; others get V_ADD_I32_e64 with a dead carry definition in a
	// fresh SReg_64 register that is hinted towards VCC. The caller appends the
	// source operands to the returned builder.
	MachineInstrBuilder
	SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
	                           MachineBasicBlock::iterator I,
	                           const DebugLoc &DL,
	                           unsigned DestReg) const {
	  if (ST.hasAddNoCarry())
	    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);

	  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	  MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);

	  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
	      .addReg(UnusedCarry, RegState::Define | RegState::Dead);
	}

	// True when \p Opcode is one of the two SI_KILL_*_TERMINATOR opcodes.
	bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
	  return Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
	         Opcode == AMDGPU::SI_KILL_I1_TERMINATOR;
	}

	// Map an SI_KILL_*_PSEUDO opcode to the descriptor of the corresponding
	// SI_KILL_*_TERMINATOR opcode. Any other opcode is a programming error.
	const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
	  if (Opcode == AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO)
	    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);

	  if (Opcode == AMDGPU::SI_KILL_I1_PSEUDO)
	    return get(AMDGPU::SI_KILL_I1_TERMINATOR);

	  llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
	}
	Index: head/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp (revision 328817)
	@@ -1,143 +1,156 @@
	//===-- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
	//
	//===----------------------------------------------------------------------===//

	#include "Thumb1InstrInfo.h"
	#include "ARMSubtarget.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/MC/MCInst.h"

	using namespace llvm;

	// Forward the subtarget to the ARMBaseInstrInfo base and value-initialize
	// the RI member.
	Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
	: ARMBaseInstrInfo(STI), RI() {}

	/// Fill \p NopInst with the instruction used as a no-op: a tMOVr whose
	/// first two operands are both R8, followed by the ARMCC::AL immediate and
	/// register operand 0.
	void Thumb1InstrInfo::getNoop(MCInst &NopInst) const {
	  const MCOperand R8Operand = MCOperand::createReg(ARM::R8);

	  NopInst.setOpcode(ARM::tMOVr);
	  NopInst.addOperand(R8Operand);
	  NopInst.addOperand(R8Operand);
	  NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
	  NopInst.addOperand(MCOperand::createReg(0));
	}

	// This implementation ignores \p Opc and always returns 0.
	unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
	return 0;
	}

	// Emit a copy from \p SrcReg to \p DestReg before \p I. Both must be GPRs.
	// A plain tMOVr is used on v6+ or when the source is a high register or the
	// destination is not a low register. Otherwise (low-to-low before v6) a
	// flag-setting tMOVSr is used if CPSR is dead at \p I, and a push/pop pair
	// through the stack if it is not.
	void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
	                                  MachineBasicBlock::iterator I,
	                                  const DebugLoc &DL, unsigned DestReg,
	                                  unsigned SrcReg, bool KillSrc) const {
	  // Need to check the arch.
	  MachineFunction &MF = *MBB.getParent();
	  const ARMSubtarget &st = MF.getSubtarget<ARMSubtarget>();

	  assert(ARM::GPRRegClass.contains(DestReg, SrcReg) &&
	         "Thumb1 can only copy GPR registers");

	  if (st.hasV6Ops() || ARM::hGPRRegClass.contains(SrcReg)
	      || !ARM::tGPRRegClass.contains(DestReg))
	    BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
	        .addReg(SrcReg, getKillRegState(KillSrc))
	        .add(predOps(ARMCC::AL));
	  else {
	    // FIXME: Can also use 'mov hi, $src; mov $dst, hi',
	    // with hi as either r10 or r11.

	    const TargetRegisterInfo *RegInfo = st.getRegisterInfo();
	    if (MBB.computeRegisterLiveness(RegInfo, ARM::CPSR, I)
	        == MachineBasicBlock::LQR_Dead) {
	      // CPSR is dead here, so the flag-setting move may clobber it.
	      BuildMI(MBB, I, DL, get(ARM::tMOVSr), DestReg)
	          .addReg(SrcReg, getKillRegState(KillSrc))
	          ->addRegisterDead(ARM::CPSR, RegInfo);
	      return;
	    }

	    // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it
	    BuildMI(MBB, I, DL, get(ARM::tPUSH))
	        .add(predOps(ARMCC::AL))
	        .addReg(SrcReg, getKillRegState(KillSrc));
	    BuildMI(MBB, I, DL, get(ARM::tPOP))
	        .add(predOps(ARMCC::AL))
	        .addReg(DestReg, getDefRegState(true));
	  }
	}

	// Spill \p SrcReg to frame index \p FI with a tSTRspi inserted before \p I.
	// Only the tGPR class, or a physical low register, is supported; anything
	// else asserts in debug builds and emits nothing otherwise. \p TRI is not
	// used.
	void Thumb1InstrInfo::
	storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
	                    unsigned SrcReg, bool isKill, int FI,
	                    const TargetRegisterClass *RC,
	                    const TargetRegisterInfo *TRI) const {
	  // Evaluate the supported-register predicate once for both the assert and
	  // the release-build guard.
	  const bool IsSupportedReg =
	      RC == &ARM::tGPRRegClass ||
	      (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
	       isARMLowRegister(SrcReg));
	  assert(IsSupportedReg && "Unknown regclass!");

	  if (IsSupportedReg) {
	    DebugLoc DL;
	    if (I != MBB.end()) DL = I->getDebugLoc();

	    MachineFunction &MF = *MBB.getParent();
	    MachineFrameInfo &MFI = MF.getFrameInfo();
	    MachineMemOperand *MMO = MF.getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
	        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
	    BuildMI(MBB, I, DL, get(ARM::tSTRspi))
	        .addReg(SrcReg, getKillRegState(isKill))
	        .addFrameIndex(FI)
	        .addImm(0)
	        .addMemOperand(MMO)
	        .add(predOps(ARMCC::AL));
	  }
	}

	// Reload \p DestReg from frame index \p FI with a tLDRspi inserted before
	// \p I. Only the tGPR class, or a physical low register, is supported;
	// anything else asserts in debug builds and emits nothing otherwise.
	// \p TRI is not used.
	void Thumb1InstrInfo::
	loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
	                     unsigned DestReg, int FI,
	                     const TargetRegisterClass *RC,
	                     const TargetRegisterInfo *TRI) const {
	  // Evaluate the supported-register predicate once for both the assert and
	  // the release-build guard.
	  const bool IsSupportedReg =
	      RC == &ARM::tGPRRegClass ||
	      (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
	       isARMLowRegister(DestReg));
	  assert(IsSupportedReg && "Unknown regclass!");

	  if (IsSupportedReg) {
	    DebugLoc DL;
	    if (I != MBB.end()) DL = I->getDebugLoc();

	    MachineFunction &MF = *MBB.getParent();
	    MachineFrameInfo &MFI = MF.getFrameInfo();
	    MachineMemOperand *MMO = MF.getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
	        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
	    BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
	        .addFrameIndex(FI)
	        .addImm(0)
	        .addMemOperand(MMO)
	        .add(predOps(ARMCC::AL));
	  }
	}

	// Expand the stack-guard load at \p MI via expandLoadStackGuardBase(),
	// using the pc-relative literal load for position-independent code and the
	// absolute one otherwise; the second opcode is tLDRi in both cases.
	void Thumb1InstrInfo::expandLoadStackGuard(
	    MachineBasicBlock::iterator MI) const {
	  MachineFunction &MF = *MI->getParent()->getParent();
	  const TargetMachine &TM = MF.getTarget();

	  const unsigned AddrLoadOpc = TM.isPositionIndependent()
	                                   ? ARM::tLDRLIT_ga_pcrel
	                                   : ARM::tLDRLIT_ga_abs;
	  expandLoadStackGuardBase(MI, AddrLoadOpc, ARM::tLDRi);
	}
	+
	+bool Thumb1InstrInfo::canCopyGluedNodeDuringSchedule(SDNode *N) const {
	+  // In Thumb1 the scheduler may need to schedule a cross-copy between GPRs
	+  // and CPSR, but this is not always possible there, so allow the scheduler
	+  // to clone tADCS and tSBCS even if they have glue.
	+  // FIXME: Actually implement the cross-copy where it is possible (post v6),
	+  // because these copies entail more spilling.
	+  unsigned Opcode = N->getMachineOpcode();
	+  if (Opcode == ARM::tADCS || Opcode == ARM::tSBCS)
	+    return true;
	+
	+  return false;
	+}
	Index: head/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
	===================================================================
	--- head/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h (revision 328816)
	+++ head/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h (revision 328817)
	@@ -1,61 +1,62 @@
	//===-- Thumb1InstrInfo.h - Thumb-1 Instruction Information ------ C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H
	#define LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H

	#include "ARMBaseInstrInfo.h"
	#include "ThumbRegisterInfo.h"

	namespace llvm {
	class ARMSubtarget;

	/// Thumb-1 implementation of the instruction-info interface, layered on
	/// ARMBaseInstrInfo. Owns the ThumbRegisterInfo instance returned by
	/// getRegisterInfo().
	class Thumb1InstrInfo : public ARMBaseInstrInfo {
	ThumbRegisterInfo RI;
	public:
	explicit Thumb1InstrInfo(const ARMSubtarget &STI);

	/// Return the noop instruction to use for a noop.
	void getNoop(MCInst &NopInst) const override;

	// Return the non-pre/post incrementing version of 'Opc'. Return 0
	// if there is not such an opcode.
	unsigned getUnindexedOpcode(unsigned Opc) const override;

	/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
	/// such, whenever a client has an instance of instruction info, it should
	/// always be able to get register info as well (through this method).
	///
	const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }

	/// Emit a copy from SrcReg to DestReg before I; both must be GPRs.
	void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
	const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
	bool KillSrc) const override;
	/// Spill SrcReg to the given frame index (tGPR class or a physical low
	/// register only).
	void storeRegToStackSlot(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI,
	unsigned SrcReg, bool isKill, int FrameIndex,
	const TargetRegisterClass *RC,
	const TargetRegisterInfo *TRI) const override;

	/// Reload DestReg from the given frame index (tGPR class or a physical low
	/// register only).
	void loadRegFromStackSlot(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI,
	unsigned DestReg, int FrameIndex,
	const TargetRegisterClass *RC,
	const TargetRegisterInfo *TRI) const override;

	/// Returns true for tADCS and tSBCS so that the scheduler may clone them
	/// even when they carry glue; false for every other opcode.
	+ bool canCopyGluedNodeDuringSchedule(SDNode *N) const override;
	private:
	/// Expand the stack-guard load pseudo at MI, choosing the pc-relative or
	/// absolute literal load depending on position independence.
	void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
	};
	}

	#endif
	Index: head/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp (revision 328817)
	@@ -1,4381 +1,4380 @@
	//===- MipsISelLowering.cpp - Mips DAG Lowering Implementation ------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that Mips uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "MipsISelLowering.h"
	#include "InstPrinter/MipsInstPrinter.h"
	#include "MCTargetDesc/MipsBaseInfo.h"
	#include "MCTargetDesc/MipsMCTargetDesc.h"
	#include "MipsCCState.h"
	#include "MipsInstrInfo.h"
	#include "MipsMachineFunction.h"
	#include "MipsRegisterInfo.h"
	#include "MipsSubtarget.h"
	#include "MipsTargetMachine.h"
	#include "MipsTargetObjectFile.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/FunctionLoweringInfo.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetFrameLowering.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cctype>
	#include <cstdint>
	#include <deque>
	#include <iterator>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "mips-lower"

	STATISTIC(NumTailCalls, "Number of tail calls");

	// Hidden command-line flag "mxgot": enables a GOT larger than 64k. Off by
	// default.
	static cl::opt<bool>
	LargeGOT("mxgot", cl::Hidden,
	cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false));

	// Hidden command-line flag "mno-check-zero-division": suppresses the trap on
	// integer division by zero. Off by default.
	static cl::opt<bool>
	NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
	cl::desc("MIPS: Don't trap on integer division by zero."),
	cl::init(false));

	// The eight registers D12_64 through D19_64.
	// NOTE(review): presumably the 64-bit FP argument registers; the use site is
	// outside this excerpt, so confirm.
	static const MCPhysReg Mips64DPRegs[8] = {
	Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
	Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
	};

	// Decomposes I when isShiftedMask_64() accepts it as a shifted mask.
	// On success, Size receives the number of set bits, Pos receives the index of
	// the lowest set bit, and true is returned. Otherwise false is returned and
	// neither output is written.
	// For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
	static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
	  const bool IsMask = isShiftedMask_64(I);
	  if (IsMask) {
	    Size = countPopulation(I);
	    Pos = countTrailingZeros(I);
	  }
	  return IsMask;
	}

	// The MIPS MSA ABI passes vector arguments in the integer register set.
	// The integer register type used depends on the ABI: i32 for O32, i64
	// otherwise. Non-vector types, and vectors when MSA is unavailable, fall back
	// to getRegisterType().
	MVT MipsTargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
	  if (!VT.isVector() || !Subtarget.hasMSA())
	    return MipsTargetLowering::getRegisterType(VT);

	  if (Subtarget.isABI_O32())
	    return MVT::i32;
	  return MVT::i64;
	}

	// Selects the register type used to pass a value of type VT.
	// Vectors are passed as i32 on O32; on the other ABIs a vector that is exactly
	// 32 bits wide uses i32 and every other vector uses i64. Non-vector types are
	// delegated to getRegisterType().
	MVT MipsTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
	                                                      EVT VT) const {
	  if (!VT.isVector())
	    return MipsTargetLowering::getRegisterType(Context, VT);

	  if (Subtarget.isABI_O32())
	    return MVT::i32;

	  const bool FitsIn32Bits = VT.getSizeInBits() == 32;
	  return FitsIn32Bits ? MVT::i32 : MVT::i64;
	}

	// Returns how many registers are needed to pass a value of type VT.
	// A vector needs its bit width divided by the register width (32 on O32, 64
	// otherwise), but never fewer than one register. Non-vector types are
	// delegated to getNumRegisters().
	unsigned MipsTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
	                                                           EVT VT) const {
	  if (!VT.isVector())
	    return MipsTargetLowering::getNumRegisters(Context, VT);

	  const unsigned RegBits = Subtarget.isABI_O32() ? 32 : 64;
	  return std::max(VT.getSizeInBits() / RegBits, 1U);
	}

	// Describes how the vector type VT is split up for argument passing.
	// RegisterVT and IntermediateVT are both set to the calling-convention
	// register type for VT. NumIntermediates is the vector's element count when
	// the whole vector is narrower than one register, and otherwise the number of
	// registers that VT's bits fill. Returns NumIntermediates.
	unsigned MipsTargetLowering::getVectorTypeBreakdownForCallingConv(
	    LLVMContext &Context, EVT VT, EVT &IntermediateVT,
	    unsigned &NumIntermediates, MVT &RegisterVT) const {
	  // Break down vector types to either 2 i64s or 4 i32s.
	  RegisterVT = getRegisterTypeForCallingConv(Context, VT);
	  IntermediateVT = RegisterVT;

	  const unsigned VecBits = VT.getSizeInBits();
	  const unsigned RegBits = RegisterVT.getSizeInBits();
	  if (VecBits < RegBits)
	    NumIntermediates = VT.getVectorNumElements();
	  else
	    NumIntermediates = VecBits / RegBits;

	  return NumIntermediates;
	}

	// Returns a register node of type Ty for the current function's global base
	// register, as reported by its MipsFunctionInfo.
	SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned GlobalBaseReg = MF.getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
	  return DAG.getRegister(GlobalBaseReg, Ty);
	}

	// Builds the target global-address node for N's global with a zero offset and
	// the given target flag.
	SDValue MipsTargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
	                                          SelectionDAG &DAG,
	                                          unsigned Flag) const {
	  const GlobalValue *GV = N->getGlobal();
	  return DAG.getTargetGlobalAddress(GV, SDLoc(N), Ty, /*offset=*/0, Flag);
	}

	// Builds the target external-symbol node for N's symbol with the given target
	// flag.
	SDValue MipsTargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
	                                          SelectionDAG &DAG,
	                                          unsigned Flag) const {
	  const char *Symbol = N->getSymbol();
	  return DAG.getTargetExternalSymbol(Symbol, Ty, Flag);
	}

	// Builds the target block-address node for N's block address with a zero
	// offset and the given target flag.
	SDValue MipsTargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
	                                          SelectionDAG &DAG,
	                                          unsigned Flag) const {
	  const BlockAddress *BA = N->getBlockAddress();
	  return DAG.getTargetBlockAddress(BA, Ty, /*Offset=*/0, Flag);
	}

	// Builds the target jump-table node for N's jump table index with the given
	// target flag.
	SDValue MipsTargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
	                                          SelectionDAG &DAG,
	                                          unsigned Flag) const {
	  const int JumpTableIndex = N->getIndex();
	  return DAG.getTargetJumpTable(JumpTableIndex, Ty, Flag);
	}

	// Builds the target constant-pool node for N's constant, carrying over N's
	// alignment and offset and applying the given target flag.
	SDValue MipsTargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
	                                          SelectionDAG &DAG,
	                                          unsigned Flag) const {
	  const Constant *Value = N->getConstVal();
	  const unsigned Alignment = N->getAlignment();
	  const int Offset = N->getOffset();
	  return DAG.getTargetConstantPool(Value, Ty, Alignment, Offset, Flag);
	}

	// Returns the printable name of a MIPS-specific DAG opcode. Yields nullptr for
	// MipsISD::FIRST_NUMBER and for any opcode that is not listed below.
	const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
	// Expands to a case label that returns the enumerator's spelling verbatim,
	// e.g. MAKE_CASE(MipsISD::Hi) returns "MipsISD::Hi".
	#define MAKE_CASE(NODE) case NODE: return #NODE;
	  switch ((MipsISD::NodeType)Opcode) {
	  case MipsISD::FIRST_NUMBER:
	    break;
	  MAKE_CASE(MipsISD::JmpLink)
	  MAKE_CASE(MipsISD::TailCall)
	  MAKE_CASE(MipsISD::Highest)
	  MAKE_CASE(MipsISD::Higher)
	  MAKE_CASE(MipsISD::Hi)
	  MAKE_CASE(MipsISD::Lo)
	  MAKE_CASE(MipsISD::GotHi)
	  MAKE_CASE(MipsISD::GPRel)
	  MAKE_CASE(MipsISD::ThreadPointer)
	  MAKE_CASE(MipsISD::Ret)
	  MAKE_CASE(MipsISD::ERet)
	  MAKE_CASE(MipsISD::EH_RETURN)
	  MAKE_CASE(MipsISD::FPBrcond)
	  MAKE_CASE(MipsISD::FPCmp)
	  MAKE_CASE(MipsISD::FSELECT)
	  MAKE_CASE(MipsISD::MTC1_D64)
	  MAKE_CASE(MipsISD::CMovFP_T)
	  MAKE_CASE(MipsISD::CMovFP_F)
	  MAKE_CASE(MipsISD::TruncIntFP)
	  MAKE_CASE(MipsISD::MFHI)
	  MAKE_CASE(MipsISD::MFLO)
	  MAKE_CASE(MipsISD::MTLOHI)
	  MAKE_CASE(MipsISD::Mult)
	  MAKE_CASE(MipsISD::Multu)
	  MAKE_CASE(MipsISD::MAdd)
	  MAKE_CASE(MipsISD::MAddu)
	  MAKE_CASE(MipsISD::MSub)
	  MAKE_CASE(MipsISD::MSubu)
	  MAKE_CASE(MipsISD::DivRem)
	  MAKE_CASE(MipsISD::DivRemU)
	  MAKE_CASE(MipsISD::DivRem16)
	  MAKE_CASE(MipsISD::DivRemU16)
	  MAKE_CASE(MipsISD::BuildPairF64)
	  MAKE_CASE(MipsISD::ExtractElementF64)
	  MAKE_CASE(MipsISD::Wrapper)
	  MAKE_CASE(MipsISD::DynAlloc)
	  MAKE_CASE(MipsISD::Sync)
	  MAKE_CASE(MipsISD::Ext)
	  MAKE_CASE(MipsISD::Ins)
	  MAKE_CASE(MipsISD::CIns)
	  MAKE_CASE(MipsISD::LWL)
	  MAKE_CASE(MipsISD::LWR)
	  MAKE_CASE(MipsISD::SWL)
	  MAKE_CASE(MipsISD::SWR)
	  MAKE_CASE(MipsISD::LDL)
	  MAKE_CASE(MipsISD::LDR)
	  MAKE_CASE(MipsISD::SDL)
	  MAKE_CASE(MipsISD::SDR)
	  MAKE_CASE(MipsISD::EXTP)
	  MAKE_CASE(MipsISD::EXTPDP)
	  MAKE_CASE(MipsISD::EXTR_S_H)
	  MAKE_CASE(MipsISD::EXTR_W)
	  MAKE_CASE(MipsISD::EXTR_R_W)
	  MAKE_CASE(MipsISD::EXTR_RS_W)
	  MAKE_CASE(MipsISD::SHILO)
	  MAKE_CASE(MipsISD::MTHLIP)
	  MAKE_CASE(MipsISD::MULSAQ_S_W_PH)
	  MAKE_CASE(MipsISD::MAQ_S_W_PHL)
	  MAKE_CASE(MipsISD::MAQ_S_W_PHR)
	  MAKE_CASE(MipsISD::MAQ_SA_W_PHL)
	  MAKE_CASE(MipsISD::MAQ_SA_W_PHR)
	  MAKE_CASE(MipsISD::DPAU_H_QBL)
	  MAKE_CASE(MipsISD::DPAU_H_QBR)
	  MAKE_CASE(MipsISD::DPSU_H_QBL)
	  MAKE_CASE(MipsISD::DPSU_H_QBR)
	  MAKE_CASE(MipsISD::DPAQ_S_W_PH)
	  MAKE_CASE(MipsISD::DPSQ_S_W_PH)
	  MAKE_CASE(MipsISD::DPAQ_SA_L_W)
	  MAKE_CASE(MipsISD::DPSQ_SA_L_W)
	  MAKE_CASE(MipsISD::DPA_W_PH)
	  MAKE_CASE(MipsISD::DPS_W_PH)
	  MAKE_CASE(MipsISD::DPAQX_S_W_PH)
	  MAKE_CASE(MipsISD::DPAQX_SA_W_PH)
	  MAKE_CASE(MipsISD::DPAX_W_PH)
	  MAKE_CASE(MipsISD::DPSX_W_PH)
	  MAKE_CASE(MipsISD::DPSQX_S_W_PH)
	  MAKE_CASE(MipsISD::DPSQX_SA_W_PH)
	  MAKE_CASE(MipsISD::MULSA_W_PH)
	  MAKE_CASE(MipsISD::MULT)
	  MAKE_CASE(MipsISD::MULTU)
	  MAKE_CASE(MipsISD::MADD_DSP)
	  MAKE_CASE(MipsISD::MADDU_DSP)
	  MAKE_CASE(MipsISD::MSUB_DSP)
	  MAKE_CASE(MipsISD::MSUBU_DSP)
	  MAKE_CASE(MipsISD::SHLL_DSP)
	  MAKE_CASE(MipsISD::SHRA_DSP)
	  MAKE_CASE(MipsISD::SHRL_DSP)
	  MAKE_CASE(MipsISD::SETCC_DSP)
	  MAKE_CASE(MipsISD::SELECT_CC_DSP)
	  MAKE_CASE(MipsISD::VALL_ZERO)
	  MAKE_CASE(MipsISD::VANY_ZERO)
	  MAKE_CASE(MipsISD::VALL_NONZERO)
	  MAKE_CASE(MipsISD::VANY_NONZERO)
	  MAKE_CASE(MipsISD::VCEQ)
	  MAKE_CASE(MipsISD::VCLE_S)
	  MAKE_CASE(MipsISD::VCLE_U)
	  MAKE_CASE(MipsISD::VCLT_S)
	  MAKE_CASE(MipsISD::VCLT_U)
	  MAKE_CASE(MipsISD::VSMAX)
	  MAKE_CASE(MipsISD::VSMIN)
	  MAKE_CASE(MipsISD::VUMAX)
	  MAKE_CASE(MipsISD::VUMIN)
	  MAKE_CASE(MipsISD::VEXTRACT_SEXT_ELT)
	  MAKE_CASE(MipsISD::VEXTRACT_ZEXT_ELT)
	  MAKE_CASE(MipsISD::VNOR)
	  MAKE_CASE(MipsISD::VSHF)
	  MAKE_CASE(MipsISD::SHF)
	  MAKE_CASE(MipsISD::ILVEV)
	  MAKE_CASE(MipsISD::ILVOD)
	  MAKE_CASE(MipsISD::ILVL)
	  MAKE_CASE(MipsISD::ILVR)
	  MAKE_CASE(MipsISD::PCKEV)
	  MAKE_CASE(MipsISD::PCKOD)
	  MAKE_CASE(MipsISD::INSVE)
	  }
	#undef MAKE_CASE
	  return nullptr;
	}

	// Constructs the MIPS lowering object for target machine TM and subtarget STI.
	// Records the subtarget and ABI, then configures the generic TargetLowering
	// tables: boolean representation, load-extension / truncating-store actions,
	// per-operation legalization actions, target DAG combines, unavailable
	// libcalls, alignment settings and the stack pointer register.
	//
	// The calls below are kept in their original order; some operation/type pairs
	// are set under subtarget-dependent conditions.
	MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
	                                       const MipsSubtarget &STI)
	    : TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {
	  // Mips does not have i1 type, so use i32 for
	  // setcc operations results (slt, sgt, ...).
	  setBooleanContents(ZeroOrOneBooleanContent);
	  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
	  // The cmp.cond.fmt instruction in MIPS32r6/MIPS64r6 uses 0 and -1 like MSA
	  // does. Integer booleans still use 0 and 1.
	  if (Subtarget.hasMips32r6())
	    setBooleanContents(ZeroOrOneBooleanContent,
	                       ZeroOrNegativeOneBooleanContent);

	  // Load extended operations for i1 types must be promoted
	  for (MVT VT : MVT::integer_valuetypes()) {
	    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
	    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
	    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
	  }

	  // MIPS doesn't have extending float->double load/store. Set LoadExtAction
	  // for f32, f16
	  for (MVT VT : MVT::fp_valuetypes()) {
	    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
	    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
	  }

	  // Set LoadExtAction for f16 vectors to Expand
	  for (MVT VT : MVT::fp_vector_valuetypes()) {
	    MVT F16VT = MVT::getVectorVT(MVT::f16, VT.getVectorNumElements());
	    if (F16VT.isValid())
	      setLoadExtAction(ISD::EXTLOAD, VT, F16VT, Expand);
	  }

	  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

	  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	  // Used by legalize types to correctly generate the setcc result.
	  // Without this, every float setcc comes with a AND/OR with the result,
	  // we don't want this, since the fpcmp result goes to a flag register,
	  // which is used implicitly by brcond and select operations.
	  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

	  // Mips Custom Operations
	  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
	  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
	  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
	  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
	  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
	  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
	  setOperationAction(ISD::SELECT, MVT::f32, Custom);
	  setOperationAction(ISD::SELECT, MVT::f64, Custom);
	  setOperationAction(ISD::SELECT, MVT::i32, Custom);
	  setOperationAction(ISD::SETCC, MVT::f32, Custom);
	  setOperationAction(ISD::SETCC, MVT::f64, Custom);
	  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
	  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
	  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
	  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

	  // 64-bit GPR subtargets additionally custom-lower the i64 forms.
	  if (Subtarget.isGP64bit()) {
	    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
	    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
	    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
	    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
	    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
	    setOperationAction(ISD::SELECT, MVT::i64, Custom);
	    setOperationAction(ISD::LOAD, MVT::i64, Custom);
	    setOperationAction(ISD::STORE, MVT::i64, Custom);
	    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
	    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
	    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
	  }

	  if (!Subtarget.isGP64bit()) {
	    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
	    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
	    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
	  }

	  setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
	  if (Subtarget.isGP64bit())
	    setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);

	  setOperationAction(ISD::SDIV, MVT::i32, Expand);
	  setOperationAction(ISD::SREM, MVT::i32, Expand);
	  setOperationAction(ISD::UDIV, MVT::i32, Expand);
	  setOperationAction(ISD::UREM, MVT::i32, Expand);
	  setOperationAction(ISD::SDIV, MVT::i64, Expand);
	  setOperationAction(ISD::SREM, MVT::i64, Expand);
	  setOperationAction(ISD::UDIV, MVT::i64, Expand);
	  setOperationAction(ISD::UREM, MVT::i64, Expand);

	  // i32 ADDC/ADDE are expanded unless the subtarget has both DSP and
	  // MIPS32r2.
	  if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) {
	    setOperationAction(ISD::ADDC, MVT::i32, Expand);
	    setOperationAction(ISD::ADDE, MVT::i32, Expand);
	  }

	  setOperationAction(ISD::ADDC, MVT::i64, Expand);
	  setOperationAction(ISD::ADDE, MVT::i64, Expand);
	  setOperationAction(ISD::SUBC, MVT::i32, Expand);
	  setOperationAction(ISD::SUBE, MVT::i32, Expand);
	  setOperationAction(ISD::SUBC, MVT::i64, Expand);
	  setOperationAction(ISD::SUBE, MVT::i64, Expand);

	  // Operations not directly supported by Mips.
	  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
	  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
	  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
	  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
	  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
	  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
	  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
	  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
	  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
	  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
	  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
	  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
	  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
	  if (Subtarget.hasCnMips()) {
	    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
	    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
	  } else {
	    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
	    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
	  }
	  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
	  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
	  setOperationAction(ISD::ROTL, MVT::i32, Expand);
	  setOperationAction(ISD::ROTL, MVT::i64, Expand);
	  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
	  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

	  if (!Subtarget.hasMips32r2())
	    setOperationAction(ISD::ROTR, MVT::i32, Expand);

	  if (!Subtarget.hasMips64r2())
	    setOperationAction(ISD::ROTR, MVT::i64, Expand);

	  setOperationAction(ISD::FSIN, MVT::f32, Expand);
	  setOperationAction(ISD::FSIN, MVT::f64, Expand);
	  setOperationAction(ISD::FCOS, MVT::f32, Expand);
	  setOperationAction(ISD::FCOS, MVT::f64, Expand);
	  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
	  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	  setOperationAction(ISD::FPOW, MVT::f32, Expand);
	  setOperationAction(ISD::FPOW, MVT::f64, Expand);
	  setOperationAction(ISD::FLOG, MVT::f32, Expand);
	  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
	  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
	  setOperationAction(ISD::FEXP, MVT::f32, Expand);
	  setOperationAction(ISD::FMA, MVT::f32, Expand);
	  setOperationAction(ISD::FMA, MVT::f64, Expand);
	  setOperationAction(ISD::FREM, MVT::f32, Expand);
	  setOperationAction(ISD::FREM, MVT::f64, Expand);

	  // Lower f16 conversion operations into library calls
	  setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
	  setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
	  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
	  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);

	  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

	  setOperationAction(ISD::VASTART, MVT::Other, Custom);
	  setOperationAction(ISD::VAARG, MVT::Other, Custom);
	  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
	  setOperationAction(ISD::VAEND, MVT::Other, Expand);

	  // Use the default for now
	  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	  if (!Subtarget.isGP64bit()) {
	    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
	    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
	  }

	  if (!Subtarget.hasMips32r2()) {
	    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
	    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
	  }

	  // MIPS16 lacks MIPS32's clz and clo instructions.
	  if (!Subtarget.hasMips32() || Subtarget.inMips16Mode())
	    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
	  if (!Subtarget.hasMips64())
	    setOperationAction(ISD::CTLZ, MVT::i64, Expand);

	  if (!Subtarget.hasMips32r2())
	    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
	  if (!Subtarget.hasMips64r2())
	    setOperationAction(ISD::BSWAP, MVT::i64, Expand);

	  if (Subtarget.isGP64bit()) {
	    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Custom);
	    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Custom);
	    setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i32, Custom);
	    setTruncStoreAction(MVT::i64, MVT::i32, Custom);
	  }

	  setOperationAction(ISD::TRAP, MVT::Other, Legal);

	  // Node kinds for which the target DAG combine hook is requested.
	  setTargetDAGCombine(ISD::SDIVREM);
	  setTargetDAGCombine(ISD::UDIVREM);
	  setTargetDAGCombine(ISD::SELECT);
	  setTargetDAGCombine(ISD::AND);
	  setTargetDAGCombine(ISD::OR);
	  setTargetDAGCombine(ISD::ADD);
	  setTargetDAGCombine(ISD::SUB);
	  setTargetDAGCombine(ISD::AssertZext);
	  setTargetDAGCombine(ISD::SHL);

	  if (ABI.IsO32()) {
	    // These libcalls are not available in 32-bit.
	    setLibcallName(RTLIB::SHL_I128, nullptr);
	    setLibcallName(RTLIB::SRL_I128, nullptr);
	    setLibcallName(RTLIB::SRA_I128, nullptr);
	  }

	  setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2);

	  // The arguments on the stack are defined in terms of 4-byte slots on O32
	  // and 8-byte slots on N32/N64.
	  setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? 8 : 4);

	  setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP);

	  MaxStoresPerMemcpy = 16;

	  isMicroMips = Subtarget.inMicroMipsMode();
	}

	// Factory: returns the MIPS16 lowering object when the subtarget is in MIPS16
	// mode, and the MipsSE lowering object otherwise.
	const MipsTargetLowering *MipsTargetLowering::create(const MipsTargetMachine &TM,
	                                                     const MipsSubtarget &STI) {
	  return STI.inMips16Mode() ? createMips16TargetLowering(TM, STI)
	                            : createMipsSETargetLowering(TM, STI);
	}

	// Create a fast isel object.
	// Returns nullptr when fast instruction selection is not enabled in the target
	// options, or when the subtarget / code model combination is not supported
	// (see the checks below).
	FastISel *
	MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	                                   const TargetLibraryInfo *libInfo) const {
	  const MipsTargetMachine &TM =
	      static_cast<const MipsTargetMachine &>(funcInfo.MF->getTarget());

	  // We support only the standard encoding [MIPS32,MIPS32R5] ISAs.
	  bool UseFastISel = TM.Options.EnableFastISel && Subtarget.hasMips32() &&
	                     !Subtarget.hasMips32r6() && !Subtarget.inMips16Mode() &&
	                     !Subtarget.inMicroMipsMode();

	  // Disable if any of the following is true:
	  // We do not generate PIC, the ABI is not O32, LargeGOT is being used.
	  if (!TM.isPositionIndependent() || !TM.getABI().IsO32() || LargeGOT)
	    UseFastISel = false;

	  return UseFastISel ? Mips::createFastISel(funcInfo, libInfo) : nullptr;
	}

	// Result type of a SETCC on operands of type VT: a vector compare yields VT
	// with its elements changed to integers, anything else yields i32.
	EVT MipsTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
	                                           EVT VT) const {
	  if (VT.isVector())
	    return VT.changeVectorElementTypeToInteger();
	  return MVT::i32;
	}

	// DAG combine for ISD::SDIVREM / ISD::UDIVREM.
	// After operation legalization, replaces the node with a glue-producing
	// MipsISD::DivRem16 / DivRemU16 node and rewires each used result to a
	// CopyFromReg of LO (result 0) or HI (result 1). Uses are replaced in place,
	// so an empty SDValue is always returned.
	static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    const MipsSubtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  const EVT ResTy = N->getValueType(0);
	  const bool Is32Bit = ResTy == MVT::i32;
	  const unsigned LoReg = Is32Bit ? Mips::LO0 : Mips::LO0_64;
	  const unsigned HiReg = Is32Bit ? Mips::HI0 : Mips::HI0_64;
	  const bool IsSigned = N->getOpcode() == ISD::SDIVREM;
	  const unsigned DivOpc = IsSigned ? MipsISD::DivRem16 : MipsISD::DivRemU16;
	  const SDLoc DL(N);

	  // The divide node only produces glue; that glue threads the register copies
	  // that follow.
	  SDValue Glue = DAG.getNode(DivOpc, DL, MVT::Glue, N->getOperand(0),
	                             N->getOperand(1));
	  SDValue Chain = DAG.getEntryNode();

	  // insert MFLO
	  if (N->hasAnyUseOfValue(0)) {
	    SDValue FromLo = DAG.getCopyFromReg(Chain, DL, LoReg, ResTy, Glue);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), FromLo);
	    // Continue the chain and glue from this copy so the HI copy follows it.
	    Chain = FromLo.getValue(1);
	    Glue = FromLo.getValue(2);
	  }

	  // insert MFHI
	  if (N->hasAnyUseOfValue(1)) {
	    SDValue FromHi = DAG.getCopyFromReg(Chain, DL, HiReg, ResTy, Glue);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), FromHi);
	  }

	  return SDValue();
	}

	// Maps a generic ISD condition code onto the MIPS floating point condition
	// code. The unqualified codes (SETEQ, SETLT, ...) map to the same result as
	// their ordered counterparts. Any code not listed is unreachable.
	static Mips::CondCode condCodeToFCC(ISD::CondCode CC) {
	  switch (CC) {
	  // Ordered and unqualified comparisons.
	  case ISD::SETEQ:
	  case ISD::SETOEQ:
	    return Mips::FCOND_OEQ;
	  case ISD::SETNE:
	  case ISD::SETONE:
	    return Mips::FCOND_ONE;
	  case ISD::SETLT:
	  case ISD::SETOLT:
	    return Mips::FCOND_OLT;
	  case ISD::SETGT:
	  case ISD::SETOGT:
	    return Mips::FCOND_OGT;
	  case ISD::SETLE:
	  case ISD::SETOLE:
	    return Mips::FCOND_OLE;
	  case ISD::SETGE:
	  case ISD::SETOGE:
	    return Mips::FCOND_OGE;
	  case ISD::SETO:
	    return Mips::FCOND_OR;

	  // Unordered comparisons.
	  case ISD::SETUEQ:
	    return Mips::FCOND_UEQ;
	  case ISD::SETUNE:
	    return Mips::FCOND_UNE;
	  case ISD::SETULT:
	    return Mips::FCOND_ULT;
	  case ISD::SETULE:
	    return Mips::FCOND_ULE;
	  case ISD::SETUGT:
	    return Mips::FCOND_UGT;
	  case ISD::SETUGE:
	    return Mips::FCOND_UGE;
	  case ISD::SETUO:
	    return Mips::FCOND_UN;

	  default:
	    llvm_unreachable("Unknown fp condition code!");
	  }
	}

	/// Tells whether floating point conditional branches and conditional moves
	/// that consume condition code CC have to be inverted. Codes in the range
	/// [FCOND_F, FCOND_NGT] are used directly; codes in [FCOND_T, FCOND_GT]
	/// require inversion. Any other value trips the assertion.
	static bool invertFPCondCodeUser(Mips::CondCode CC) {
	  const bool UsedDirectly = CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT;
	  if (!UsedDirectly) {
	    assert((CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT) &&
	           "Illegal Condition Code");
	    return true;
	  }
	  return false;
	}

	// Builds a MipsISD::FPCmp node (glue result) from a floating point SETCC.
	// Op is returned unchanged when it is not a SETCC or when the SETCC does not
	// compare floating point values.
	static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
	  // Only SETCC nodes are candidates.
	  if (Op.getOpcode() != ISD::SETCC)
	    return Op;

	  SDValue Lhs = Op.getOperand(0);
	  if (!Lhs.getValueType().isFloatingPoint())
	    return Op;

	  SDValue Rhs = Op.getOperand(1);
	  SDLoc DL(Op);

	  // The third operand is assumed to be a CondCodeSDNode; cast<> asserts on
	  // anything else. Add an explicit type check here if that ever changes.
	  ISD::CondCode Pred = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  SDValue FCC = DAG.getConstant(condCodeToFCC(Pred), DL, MVT::i32);

	  return DAG.getNode(MipsISD::FPCmp, DL, MVT::Glue, Lhs, Rhs, FCC);
	}

	// Builds a MipsISD::CMovFP_T or MipsISD::CMovFP_F node selecting between True
	// and False. Cond's operand 2 holds the MIPS condition code as a constant; the
	// _F form is chosen when invertFPCondCodeUser() says the user must be
	// inverted. Cond itself is passed as the last operand.
	static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
	                            SDValue False, const SDLoc &DL) {
	  ConstantSDNode *CondCodeNode = cast<ConstantSDNode>(Cond.getOperand(2));
	  const Mips::CondCode FCC =
	      static_cast<Mips::CondCode>(CondCodeNode->getSExtValue());
	  const bool Inverted = invertFPCondCodeUser(FCC);
	  SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);

	  unsigned MoveOpc;
	  if (Inverted)
	    MoveOpc = MipsISD::CMovFP_F;
	  else
	    MoveOpc = MipsISD::CMovFP_T;

	  return DAG.getNode(MoveOpc, DL, True.getValueType(), True, FCC0, False,
	                     Cond);
	}

	// DAG combine for ISD::SELECT whose condition is an integer SETCC and whose
	// false operand is an integer constant. Runs only after operation
	// legalization. Three rewrites are attempted:
	//   * false operand == 0: invert the condition and swap the operands;
	//   * true - false == 1:  replace the select with (add setcc, false);
	//   * true - false == -1: replace it with (add inverted-setcc, true).
	// Returns the replacement node, or an empty SDValue when nothing applies.
	static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    const MipsSubtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue SetCC = N->getOperand(0);

	  if ((SetCC.getOpcode() != ISD::SETCC) ||
	      !SetCC.getOperand(0).getValueType().isInteger())
	    return SDValue();

	  SDValue False = N->getOperand(2);
	  EVT FalseTy = False.getValueType();

	  if (!FalseTy.isInteger())
	    return SDValue();

	  // Every rewrite below needs a constant false operand.
	  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(False);
	  if (!FalseC)
	    return SDValue();

	  const SDLoc DL(N);

	  // If the RHS (False) is 0, we swap the order of the operands
	  // of ISD::SELECT (obviously also inverting the condition) so that we can
	  // take advantage of conditional moves using the $0 register.
	  // Example:
	  // return (a != 0) ? x : 0;
	  // load $reg, x
	  // movz $reg, $0, a
	  if (!FalseC->getZExtValue()) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
	    SDValue True = N->getOperand(1);

	    SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
	                         SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));

	    return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True);
	  }

	  // If both operands are integer constants there's a possibility that we
	  // can do some interesting optimizations.
	  SDValue True = N->getOperand(1);
	  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(True);

	  if (!TrueC || !True.getValueType().isInteger())
	    return SDValue();

	  // We'll also ignore MVT::i64 operands as this optimization proves
	  // to be ineffective because of the required sign extensions as the result
	  // of a SETCC operator is always MVT::i32 for non-vector types.
	  if (True.getValueType() == MVT::i64)
	    return SDValue();

	  int64_t Diff = TrueC->getSExtValue() - FalseC->getSExtValue();

	  // 1) (a < x) ? y : y-1
	  // slti $reg1, a, x
	  // addiu $reg2, $reg1, y-1
	  if (Diff == 1)
	    return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, False);

	  // 2) (a < x) ? y-1 : y
	  // slti $reg1, a, x
	  // xor $reg1, $reg1, 1
	  // addiu $reg2, $reg1, y-1
	  if (Diff == -1) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
	    SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
	                         SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
	    return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, True);
	  }

	  // Couldn't optimize.
	  return SDValue();
	}

	// DAG combine for MipsISD::CMovFP_T / CMovFP_F. Runs only after operation
	// legalization and only when the false operand (operand 2) is the constant 0.
	// The node is rebuilt with the opposite opcode and the true/false operands
	// swapped; the FCC operand and glue are carried over unchanged.
	// Returns the new node, or an empty SDValue when the pattern does not match.
	static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    const MipsSubtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue ValueIfTrue = N->getOperand(0), ValueIfFalse = N->getOperand(2);

	  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(ValueIfFalse);
	  if (!FalseC || FalseC->getZExtValue())
	    return SDValue();

	  // Since RHS (False) is 0, we swap the order of the True/False operands
	  // (obviously also inverting the condition) so that we can
	  // take advantage of conditional moves using the $0 register.
	  // Example:
	  // return (a != 0) ? x : 0;
	  // load $reg, x
	  // movz $reg, $0, a
	  unsigned Opc = (N->getOpcode() == MipsISD::CMovFP_T) ? MipsISD::CMovFP_F :
	                                                         MipsISD::CMovFP_T;

	  SDValue FCC = N->getOperand(1), Glue = N->getOperand(3);
	  return DAG.getNode(Opc, SDLoc(N), ValueIfFalse.getValueType(),
	                     ValueIfFalse, FCC, ValueIfTrue, Glue);
	}

	// DAG combine for ISD::AND with a constant shifted-mask second operand.
	// Runs only after operation legalization and only when the subtarget has
	// extract/insert instructions. Depending on the first operand it forms:
	//   * MipsISD::Ext  from (and (sra|srl $src, pos), mask) with mask at bit 0;
	//   * MipsISD::CIns from (and (shl $src, pos), mask) on cnMIPS;
	//   * MipsISD::Ext  from (and $src, mask) when mask starts at bit 0 and is
	//     larger than 0xffff.
	// Returns the new node, or an empty SDValue when no pattern matches.
	static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const MipsSubtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
	    return SDValue();

	  SDValue FirstOperand = N->getOperand(0);
	  unsigned FirstOperandOpc = FirstOperand.getOpcode();
	  SDValue Mask = N->getOperand(1);
	  EVT ValTy = N->getValueType(0);
	  SDLoc DL(N);

	  // Pos stays 0 for the plain (and $src, mask) form.
	  uint64_t Pos = 0, SMPos, SMSize;
	  ConstantSDNode *CN;
	  SDValue NewOperand;
	  unsigned Opc;

	  // Op's second operand must be a shifted mask.
	  if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
	      !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
	    return SDValue();

	  if (FirstOperandOpc == ISD::SRA || FirstOperandOpc == ISD::SRL) {
	    // Pattern match EXT.
	    // $dst = and ((sra or srl) $src , pos), (2**size - 1)
	    // => ext $dst, $src, pos, size

	    // The second operand of the shift must be an immediate.
	    if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))))
	      return SDValue();

	    Pos = CN->getZExtValue();

	    // Return if the shifted mask does not start at bit 0 or the sum of its size
	    // and Pos exceeds the word's size.
	    if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
	      return SDValue();

	    Opc = MipsISD::Ext;
	    NewOperand = FirstOperand.getOperand(0);
	  } else if (FirstOperandOpc == ISD::SHL && Subtarget.hasCnMips()) {
	    // Pattern match CINS.
	    // $dst = and (shl $src , pos), mask
	    // => cins $dst, $src, pos, size
	    // mask is a shifted mask with consecutive 1's, pos = shift amount,
	    // size = population count.

	    // The second operand of the shift must be an immediate.
	    if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))))
	      return SDValue();

	    Pos = CN->getZExtValue();

	    if (SMPos != Pos || Pos >= ValTy.getSizeInBits() || SMSize >= 32 ||
	        Pos + SMSize > ValTy.getSizeInBits())
	      return SDValue();

	    NewOperand = FirstOperand.getOperand(0);
	    // SMSize is 'location' (position) in this case, not size.
	    SMSize--;
	    Opc = MipsISD::CIns;
	  } else {
	    // Pattern match EXT.
	    // $dst = and $src, (2**size - 1) , if size > 16
	    // => ext $dst, $src, pos, size , pos = 0

	    // If the mask is <= 0xffff, andi can be used instead.
	    // CN still refers to the mask constant on this path.
	    if (CN->getZExtValue() <= 0xffff)
	      return SDValue();

	    // Return if the mask doesn't start at position 0.
	    if (SMPos)
	      return SDValue();

	    Opc = MipsISD::Ext;
	    NewOperand = FirstOperand;
	  }
	  return DAG.getNode(Opc, DL, ValTy, NewOperand,
	                     DAG.getConstant(Pos, DL, MVT::i32),
	                     DAG.getConstant(SMSize, DL, MVT::i32));
	}

	// DAG combine for ISD::OR that forms MipsISD::Ins nodes. Runs only after
	// operation legalization and only when the subtarget has extract/insert
	// instructions. The first operand must be (and $src1, mask0) where ~mask0 is
	// a shifted mask. Two shapes of the second operand are handled:
	//   * (and (shl $src, pos), mask1) with mask1 == ~mask0 and pos equal to the
	//     mask position;
	//   * a constant, or (and X, constant), whose bits fit entirely inside the
	//     field cleared by mask0 (the DINS-style pattern).
	// Returns the new node, or an empty SDValue when no pattern matches.
	static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
	                                TargetLowering::DAGCombinerInfo &DCI,
	                                const MipsSubtarget &Subtarget) {
	  // Pattern match INS.
	  // $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
	  // where mask1 = (2**size - 1) << pos, mask0 = ~mask1
	  // => ins $dst, $src, pos, size, $src1
	  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
	    return SDValue();

	  SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
	  uint64_t SMPos0, SMSize0, SMPos1, SMSize1;
	  // These are assigned from dyn_cast<> and dereferenced below, so they must be
	  // pointers.
	  ConstantSDNode *CN, *CN1;

	  // See if Op's first operand matches (and $src1 , mask0).
	  if (And0.getOpcode() != ISD::AND)
	    return SDValue();

	  if (!(CN = dyn_cast<ConstantSDNode>(And0.getOperand(1))) ||
	      !isShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0))
	    return SDValue();

	  // See if Op's second operand matches (and (shl $src, pos), mask1).
	  if (And1.getOpcode() == ISD::AND &&
	      And1.getOperand(0).getOpcode() == ISD::SHL) {

	    if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
	        !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
	      return SDValue();

	    // The shift masks must have the same position and size.
	    if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
	      return SDValue();

	    SDValue Shl = And1.getOperand(0);

	    if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
	      return SDValue();

	    unsigned Shamt = CN->getZExtValue();

	    // Return if the shift amount and the first bit position of mask are not the
	    // same.
	    EVT ValTy = N->getValueType(0);
	    if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
	      return SDValue();

	    SDLoc DL(N);
	    return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0),
	                       DAG.getConstant(SMPos0, DL, MVT::i32),
	                       DAG.getConstant(SMSize0, DL, MVT::i32),
	                       And0.getOperand(0));
	  } else {
	    // Pattern match DINS.
	    // $dst = or (and $src, mask0), mask1
	    // where mask0 = ((1 << SMSize0) -1) << SMPos0
	    // => dins $dst, $src, pos, size
	    // CN still refers to And0's mask constant on this path.
	    if (~CN->getSExtValue() == ((((int64_t)1 << SMSize0) - 1) << SMPos0) &&
	        ((SMSize0 + SMPos0 <= 64 && Subtarget.hasMips64r2()) ||
	         (SMSize0 + SMPos0 <= 32))) {
	      // Check if AND instruction has constant as argument
	      bool isConstCase = And1.getOpcode() != ISD::AND;
	      if (And1.getOpcode() == ISD::AND) {
	        if (!(CN1 = dyn_cast<ConstantSDNode>(And1->getOperand(1))))
	          return SDValue();
	      } else {
	        if (!(CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1))))
	          return SDValue();
	      }
	      // Don't generate INS if constant OR operand doesn't fit into bits
	      // cleared by constant AND operand.
	      if (CN->getSExtValue() & CN1->getSExtValue())
	        return SDValue();

	      SDLoc DL(N);
	      EVT ValTy = N->getOperand(0)->getValueType(0);
	      SDValue Const1;
	      SDValue SrlX;
	      // For the (and X, constant) form the inserted value is And1 shifted
	      // right by the field position.
	      if (!isConstCase) {
	        Const1 = DAG.getConstant(SMPos0, DL, MVT::i32);
	        SrlX = DAG.getNode(ISD::SRL, DL, And1->getValueType(0), And1, Const1);
	      }
	      return DAG.getNode(
	          MipsISD::Ins, DL, N->getValueType(0),
	          isConstCase
	              ? DAG.getConstant(CN1->getSExtValue() >> SMPos0, DL, ValTy)
	              : SrlX,
	          DAG.getConstant(SMPos0, DL, MVT::i32),
	          // For value types narrower than 8 bytes the size is masked to its
	          // low 5 bits.
	          DAG.getConstant(ValTy.getSizeInBits() / 8 < 8 ? SMSize0 & 31
	                                                        : SMSize0,
	                          DL, MVT::i32),
	          And0->getOperand(0));
	    }
	    return SDValue();
	  }
	}

	/// Try to fold an add / sub rooted at \p ROOTNode that has a multiplication as
	/// one operand into a MipsISD::MAdd(u) / MipsISD::MSub(u) node which
	/// accumulates into an MTLOHI-initialised accumulator.
	///
	/// \param ROOTNode   The add or sub node being combined.
	/// \param CurDAG     DAG in which replacement nodes are created.
	/// \param Subtarget  Used to disable the combine on MIPS64.
	/// \returns an i64 BUILD_PAIR of the MFLO / MFHI results on success, or an
	///          empty SDValue when the pattern does not match.
	static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG,
	                                       const MipsSubtarget &Subtarget) {
	  // ROOTNode must have a multiplication as an operand for the match to be
	  // successful.
	  const bool LHSIsMul = ROOTNode->getOperand(0).getOpcode() == ISD::MUL;
	  const bool RHSIsMul = ROOTNode->getOperand(1).getOpcode() == ISD::MUL;
	  if (!LHSIsMul && !RHSIsMul)
	    return SDValue();

	  // Every root that is not an ADD is emitted as MSub(u) below, i.e. as
	  // "accumulator - (a * b)". When the multiplication is the left operand of
	  // the subtraction the wanted value is "(a * b) - c", which that node cannot
	  // express; combining anyway would produce the negated result.
	  const bool IsAdd = ROOTNode->getOpcode() == ISD::ADD;
	  if (!IsAdd && LHSIsMul)
	    return SDValue();

	  // We don't handle vector types here.
	  if (ROOTNode->getValueType(0).isVector())
	    return SDValue();

	  // For MIPS64, madd / msub instructions are inefficient to use with 64 bit
	  // arithmetic. E.g.
	  // (add (mul a b) c) =>
	  // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in
	  // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32)
	  // or
	  // MIPS64R2: (dins (mflo res) (mfhi res) 32 32)
	  //
	  // The overhead of setting up the Hi/Lo registers and reassembling the
	  // result makes this a dubious optimization for MIPS64. The core of the
	  // problem is that Hi/Lo contain the upper and lower 32 bits of the
	  // operand and result.
	  //
	  // It requires a chain of 4 add/mul for MIPS64R2 to get better code
	  // density than doing it naively, 5 for MIPS64. Additionally, using
	  // madd/msub on MIPS64 requires the operands actually be 32 bit sign
	  // extended operands, not true 64 bit values.
	  //
	  // FIXME: For the moment, disable this completely for MIPS64.
	  if (Subtarget.hasMips64())
	    return SDValue();

	  SDValue Mult = LHSIsMul ? ROOTNode->getOperand(0) : ROOTNode->getOperand(1);
	  SDValue AddOperand =
	      LHSIsMul ? ROOTNode->getOperand(1) : ROOTNode->getOperand(0);

	  // Transform this to a MADD only if the user of this node is the add.
	  // If there are other users of the mul, this function returns here.
	  if (!Mult.hasOneUse())
	    return SDValue();

	  // maddu and madd are unusual instructions in that on MIPS64 bits 63..31
	  // must be in canonical form, i.e. sign extended. For MIPS32, the operands
	  // of the multiply must have 32 or more sign bits, otherwise we cannot
	  // perform this optimization. We have to check this here as we're performing
	  // this optimization pre-legalization.
	  SDValue MultLHS = Mult->getOperand(0);
	  SDValue MultRHS = Mult->getOperand(1);

	  bool IsSigned = MultLHS->getOpcode() == ISD::SIGN_EXTEND &&
	                  MultRHS->getOpcode() == ISD::SIGN_EXTEND;
	  bool IsUnsigned = MultLHS->getOpcode() == ISD::ZERO_EXTEND &&
	                    MultRHS->getOpcode() == ISD::ZERO_EXTEND;

	  if (!IsSigned && !IsUnsigned)
	    return SDValue();

	  // Initialize accumulator: element 0 of the addend goes to LO, element 1 to
	  // HI.
	  SDLoc DL(ROOTNode);
	  SDValue TopHalf;
	  SDValue BottomHalf;
	  BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
	                              CurDAG.getIntPtrConstant(0, DL));

	  TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
	                           CurDAG.getIntPtrConstant(1, DL));
	  SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
	                                 BottomHalf,
	                                 TopHalf);

	  // Create MipsMAdd(u) / MipsMSub(u) node.
	  unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd)
	                          : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub);
	  SDValue MAddOps[3] = {
	      CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, MultLHS),
	      CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, MultRHS), ACCIn};
	  EVT VTs[2] = {MVT::i32, MVT::i32};
	  SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps);

	  // Reassemble the 64-bit result from the LO and HI halves.
	  SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
	  SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
	  SDValue Combined =
	      CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi);
	  return Combined;
	}

	/// Target combine hook for ISD::SUB nodes.
	///
	/// Before operation legalization, an i64 subtract is forwarded to
	/// performMADD_MSUBCombine when the subtarget has MIPS32, does not have
	/// MIPS32r6 and is not in MIPS16 mode:
	///   (sub v0 (mul v1, v2)) => (msub v1, v2, v0)
	/// In every other situation an empty SDValue is returned.
	static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const MipsSubtarget &Subtarget) {
	  // Nothing is done once operations have been legalized.
	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Subtarget requirements, checked in the same order as a chained &&.
	  if (!Subtarget.hasMips32())
	    return SDValue();
	  if (Subtarget.hasMips32r6())
	    return SDValue();
	  if (Subtarget.inMips16Mode())
	    return SDValue();

	  // Only 64-bit results are candidates.
	  const bool IsI64 = N->getValueType(0) == MVT::i64;
	  if (!IsI64)
	    return SDValue();

	  return performMADD_MSUBCombine(N, DAG, Subtarget);
	}

	/// Target combine hook for ISD::ADD nodes.
	///
	/// Before operation legalization the only candidate is
	///   (add v0 (mul v1, v2)) => (madd v1, v2, v0)
	/// for i64 results on subtargets that have MIPS32, lack MIPS32r6 and are not
	/// in MIPS16 mode. After that point the add is reassociated so that a
	/// jump-table MipsISD::Lo term ends up outermost:
	///   (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
	///
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const MipsSubtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps()) {
	    // (add v0 (mul v1, v2)) => (madd v1, v2, v0)
	    const bool CanUseMAdd = Subtarget.hasMips32() &&
	                            !Subtarget.hasMips32r6() &&
	                            !Subtarget.inMips16Mode() &&
	                            N->getValueType(0) == MVT::i64;
	    if (CanUseMAdd)
	      return performMADD_MSUBCombine(N, DAG, Subtarget);
	    return SDValue();
	  }

	  // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
	  SDValue InnerAdd = N->getOperand(1);
	  if (InnerAdd.getOpcode() != ISD::ADD)
	    return SDValue();

	  // The inner add's second operand has to be a MipsISD::Lo ...
	  SDValue LoPart = InnerAdd.getOperand(1);
	  if (LoPart.getOpcode() != MipsISD::Lo)
	    return SDValue();
	  // ... wrapping a TargetJumpTable.
	  if (LoPart.getOperand(0).getOpcode() != ISD::TargetJumpTable)
	    return SDValue();

	  EVT ResultTy = N->getValueType(0);
	  SDLoc DL(N);

	  SDValue Sum = DAG.getNode(ISD::ADD, DL, ResultTy, N->getOperand(0),
	                            InnerAdd.getOperand(0));
	  return DAG.getNode(ISD::ADD, DL, ResultTy, Sum, LoPart);
	}

	/// Target combine hook for ISD::SHL nodes: pattern match CINS.
	///   $dst = shl (and $src , imm), pos
	///   => cins $dst, $src, pos, size
	///
	/// Only attempted after operation legalization and when the subtarget
	/// reports hasCnMips().
	///
	/// \returns a MipsISD::CIns node on a match, otherwise an empty SDValue.
	static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const MipsSubtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();
	  if (!Subtarget.hasCnMips())
	    return SDValue();

	  SDValue MaskedSrc = N->getOperand(0);
	  SDValue ShiftAmount = N->getOperand(1);
	  EVT ResultTy = N->getValueType(0);
	  SDLoc DL(N);

	  // The second operand of the shift must be an immediate.
	  ConstantSDNode *ShiftC = dyn_cast<ConstantSDNode>(ShiftAmount);
	  if (!ShiftC)
	    return SDValue();

	  // The shift amount must be smaller than the width of the result type.
	  const uint64_t Pos = ShiftC->getZExtValue();
	  if (Pos >= ResultTy.getSizeInBits())
	    return SDValue();

	  // The value being shifted must come from an AND.
	  if (MaskedSrc.getOpcode() != ISD::AND)
	    return SDValue();

	  // AND's second operand must be a shifted mask.
	  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskedSrc.getOperand(1));
	  if (!MaskC)
	    return SDValue();

	  uint64_t MaskPos, MaskLen;
	  if (!isShiftedMask(MaskC->getZExtValue(), MaskPos, MaskLen))
	    return SDValue();

	  // Return if the shifted mask does not start at bit 0 or the sum of its size
	  // and Pos exceeds the word's size.
	  if (MaskPos != 0)
	    return SDValue();
	  if (MaskLen > 32)
	    return SDValue();
	  if (Pos + MaskLen > ResultTy.getSizeInBits())
	    return SDValue();

	  // The last operand is a 'location' (position) in this case, not a size,
	  // hence the mask length minus one.
	  return DAG.getNode(MipsISD::CIns, DL, ResultTy, MaskedSrc.getOperand(0),
	                     DAG.getConstant(Pos, DL, MVT::i32),
	                     DAG.getConstant(MaskLen - 1, DL, MVT::i32));
	}

	/// Dispatch node \p N to the opcode-specific perform*Combine helper.
	///
	/// \returns whatever the helper returns, or an empty SDValue for opcodes
	///          that have no helper here.
	SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
	    const {
	  SelectionDAG &DAG = DCI.DAG;

	  switch (N->getOpcode()) {
	  // Arithmetic and logic.
	  case ISD::ADD:
	    return performADDCombine(N, DAG, DCI, Subtarget);
	  case ISD::SUB:
	    return performSUBCombine(N, DAG, DCI, Subtarget);
	  case ISD::AND:
	    return performANDCombine(N, DAG, DCI, Subtarget);
	  case ISD::OR:
	    return performORCombine(N, DAG, DCI, Subtarget);
	  case ISD::SHL:
	    return performSHLCombine(N, DAG, DCI, Subtarget);

	  // Combined divide / remainder.
	  case ISD::SDIVREM:
	  case ISD::UDIVREM:
	    return performDivRemCombine(N, DAG, DCI, Subtarget);

	  // Selects and floating-point conditional moves.
	  case ISD::SELECT:
	    return performSELECTCombine(N, DAG, DCI, Subtarget);
	  case MipsISD::CMovFP_F:
	  case MipsISD::CMovFP_T:
	    return performCMovFPCombine(N, DAG, DCI, Subtarget);

	  default:
	    break;
	  }

	  // No combine for this opcode.
	  return SDValue();
	}

	/// Speculating cttz is reported as cheap exactly when the subtarget reports
	/// hasMips32().
	bool MipsTargetLowering::isCheapToSpeculateCttz() const {
	  const bool HasMips32 = Subtarget.hasMips32();
	  return HasMips32;
	}

	/// Speculating ctlz is reported as cheap exactly when the subtarget reports
	/// hasMips32().
	bool MipsTargetLowering::isCheapToSpeculateCtlz() const {
	  const bool HasMips32 = Subtarget.hasMips32();
	  return HasMips32;
	}

	/// Lower result 0 of \p N through LowerOperation and append every value
	/// produced by the lowered node to \p Results.
	///
	/// \param N        Node to lower.
	/// \param Results  Receives the values of the lowered node; left untouched
	///                 when LowerOperation produces nothing.
	/// \param DAG      DAG passed on to LowerOperation.
	void
	MipsTargetLowering::LowerOperationWrapper(SDNode *N,
	                                          SmallVectorImpl<SDValue> &Results,
	                                          SelectionDAG &DAG) const {
	  SDValue Res = LowerOperation(SDValue(N, 0), DAG);

	  // LowerOperation hands back an empty SDValue for opcodes it does not
	  // handle; there is no node to read values from in that case.
	  if (!Res.getNode())
	    return;

	  for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
	    Results.push_back(Res.getValue(I));
	}

	/// Produce replacement results for \p N by delegating unchanged to
	/// LowerOperationWrapper.
	void MipsTargetLowering::ReplaceNodeResults(SDNode *N,
	                                            SmallVectorImpl<SDValue> &Results,
	                                            SelectionDAG &DAG) const {
	  LowerOperationWrapper(N, Results, DAG);
	}

	/// Dispatch \p Op to the lower* routine that matches its opcode.
	///
	/// \returns the value produced by that routine, or an empty SDValue when the
	///          opcode is not listed here.
	SDValue MipsTargetLowering::LowerOperation(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  switch (Op.getOpcode()) {
	  case ISD::BRCOND:
	    return lowerBRCOND(Op, DAG);
	  case ISD::ConstantPool:
	    return lowerConstantPool(Op, DAG);
	  case ISD::GlobalAddress:
	    return lowerGlobalAddress(Op, DAG);
	  case ISD::BlockAddress:
	    return lowerBlockAddress(Op, DAG);
	  case ISD::GlobalTLSAddress:
	    return lowerGlobalTLSAddress(Op, DAG);
	  case ISD::JumpTable:
	    return lowerJumpTable(Op, DAG);
	  case ISD::SELECT:
	    return lowerSELECT(Op, DAG);
	  case ISD::SETCC:
	    return lowerSETCC(Op, DAG);
	  case ISD::VASTART:
	    return lowerVASTART(Op, DAG);
	  case ISD::VAARG:
	    return lowerVAARG(Op, DAG);
	  case ISD::FCOPYSIGN:
	    return lowerFCOPYSIGN(Op, DAG);
	  case ISD::FRAMEADDR:
	    return lowerFRAMEADDR(Op, DAG);
	  case ISD::RETURNADDR:
	    return lowerRETURNADDR(Op, DAG);
	  case ISD::EH_RETURN:
	    return lowerEH_RETURN(Op, DAG);
	  case ISD::ATOMIC_FENCE:
	    return lowerATOMIC_FENCE(Op, DAG);
	  case ISD::SHL_PARTS:
	    return lowerShiftLeftParts(Op, DAG);
	  case ISD::SRA_PARTS:
	    // Third argument distinguishes SRA (true) from SRL (false).
	    return lowerShiftRightParts(Op, DAG, true);
	  case ISD::SRL_PARTS:
	    return lowerShiftRightParts(Op, DAG, false);
	  case ISD::LOAD:
	    return lowerLOAD(Op, DAG);
	  case ISD::STORE:
	    return lowerSTORE(Op, DAG);
	  case ISD::EH_DWARF_CFA:
	    return lowerEH_DWARF_CFA(Op, DAG);
	  case ISD::FP_TO_SINT:
	    return lowerFP_TO_SINT(Op, DAG);
	  default:
	    break;
	  }

	  // Opcode without a lowering routine in this switch.
	  return SDValue();
	}

	//===----------------------------------------------------------------------===//
	// Lower helper functions
	//===----------------------------------------------------------------------===//

	// addLiveIn - Create a new virtual register of class RC, record the physical
	// register PReg as a live-in of MF paired with that virtual register, and
	// return the virtual register.
	static unsigned
	addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
	{
	  MachineRegisterInfo &MRI = MF.getRegInfo();
	  const unsigned VReg = MRI.createVirtualRegister(RC);
	  MRI.addLiveIn(PReg, VReg);
	  return VReg;
	}

	/// Emit "teq $divisor_reg, $zero, 7" immediately after \p MI in \p MBB,
	/// unless NoZeroDivCheck is set.
	///
	/// \param MI          Instruction whose operand 2 is used as the divisor.
	/// \param MBB         Block containing \p MI; it is also the return value.
	/// \param TII         Supplies the instruction description for the trap.
	/// \param Is64Bit     When true the trap reads the sub_32 sub-register of
	///                    the divisor.
	/// \param IsMicroMips Selects TEQ_MM instead of TEQ.
	/// \returns the address of \p MBB. \p MI itself is left in place.
	static MachineBasicBlock *insertDivByZeroTrap(MachineInstr &MI,
	                                              MachineBasicBlock &MBB,
	                                              const TargetInstrInfo &TII,
	                                              bool Is64Bit, bool IsMicroMips) {
	  if (NoZeroDivCheck)
	    return &MBB;

	  MachineOperand &Divisor = MI.getOperand(2);
	  const unsigned TrapOpc = IsMicroMips ? Mips::TEQ_MM : Mips::TEQ;
	  MachineBasicBlock::iterator InsertPt =
	      std::next(MachineBasicBlock::iterator(MI));

	  // The trap takes over the divisor's kill state, so the flag is read here
	  // before it is cleared on the original operand further down.
	  MachineInstrBuilder Trap =
	      BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII.get(TrapOpc))
	          .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
	          .addReg(Mips::ZERO)
	          .addImm(7);

	  // Use the 32-bit sub-register if this is a 64-bit division.
	  if (Is64Bit)
	    Trap->getOperand(0).setSubReg(Mips::sub_32);

	  // The divisor is now read again after MI, so MI must not kill it.
	  Divisor.setIsKill(false);

	  // Nothing is deleted: an extra instruction was injected after MI rather
	  // than MI being replaced.
	  return &MBB;
	}

	/// Expand the custom-inserter instruction \p MI that lives in \p BB.
	///
	/// Atomic pseudos are forwarded to the emitAtomic* helpers with the access
	/// size in bytes, division / modulo instructions get a divide-by-zero trap
	/// via insertDivByZeroTrap, and PseudoSELECT* instructions go to
	/// emitPseudoSELECT.
	///
	/// \returns the block returned by the selected helper. Any opcode not listed
	///          here hits llvm_unreachable.
	MachineBasicBlock *
	MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                                MachineBasicBlock *BB) const {
	  switch (MI.getOpcode()) {
	  default:
	    llvm_unreachable("Unexpected instr type to insert");
	  case Mips::ATOMIC_LOAD_ADD_I8:
	    return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
	  case Mips::ATOMIC_LOAD_ADD_I16:
	    return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
	  case Mips::ATOMIC_LOAD_ADD_I32:
	    return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
	  case Mips::ATOMIC_LOAD_ADD_I64:
	    return emitAtomicBinary(MI, BB, 8, Mips::DADDu);

	  case Mips::ATOMIC_LOAD_AND_I8:
	    return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
	  case Mips::ATOMIC_LOAD_AND_I16:
	    return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
	  case Mips::ATOMIC_LOAD_AND_I32:
	    return emitAtomicBinary(MI, BB, 4, Mips::AND);
	  case Mips::ATOMIC_LOAD_AND_I64:
	    return emitAtomicBinary(MI, BB, 8, Mips::AND64);

	  case Mips::ATOMIC_LOAD_OR_I8:
	    return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
	  case Mips::ATOMIC_LOAD_OR_I16:
	    return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
	  case Mips::ATOMIC_LOAD_OR_I32:
	    return emitAtomicBinary(MI, BB, 4, Mips::OR);
	  case Mips::ATOMIC_LOAD_OR_I64:
	    return emitAtomicBinary(MI, BB, 8, Mips::OR64);

	  case Mips::ATOMIC_LOAD_XOR_I8:
	    return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
	  case Mips::ATOMIC_LOAD_XOR_I16:
	    return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
	  case Mips::ATOMIC_LOAD_XOR_I32:
	    return emitAtomicBinary(MI, BB, 4, Mips::XOR);
	  case Mips::ATOMIC_LOAD_XOR_I64:
	    return emitAtomicBinary(MI, BB, 8, Mips::XOR64);

	  // NAND has no single opcode: BinOpcode is 0 and the Nand flag is set.
	  case Mips::ATOMIC_LOAD_NAND_I8:
	    return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
	  case Mips::ATOMIC_LOAD_NAND_I16:
	    return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
	  case Mips::ATOMIC_LOAD_NAND_I32:
	    return emitAtomicBinary(MI, BB, 4, 0, true);
	  case Mips::ATOMIC_LOAD_NAND_I64:
	    return emitAtomicBinary(MI, BB, 8, 0, true);

	  case Mips::ATOMIC_LOAD_SUB_I8:
	    return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
	  case Mips::ATOMIC_LOAD_SUB_I16:
	    return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
	  case Mips::ATOMIC_LOAD_SUB_I32:
	    return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
	  case Mips::ATOMIC_LOAD_SUB_I64:
	    return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);

	  // Swap: BinOpcode 0 without the Nand flag.
	  case Mips::ATOMIC_SWAP_I8:
	    return emitAtomicBinaryPartword(MI, BB, 1, 0);
	  case Mips::ATOMIC_SWAP_I16:
	    return emitAtomicBinaryPartword(MI, BB, 2, 0);
	  case Mips::ATOMIC_SWAP_I32:
	    return emitAtomicBinary(MI, BB, 4, 0);
	  case Mips::ATOMIC_SWAP_I64:
	    return emitAtomicBinary(MI, BB, 8, 0);

	  case Mips::ATOMIC_CMP_SWAP_I8:
	    return emitAtomicCmpSwapPartword(MI, BB, 1);
	  case Mips::ATOMIC_CMP_SWAP_I16:
	    return emitAtomicCmpSwapPartword(MI, BB, 2);
	  case Mips::ATOMIC_CMP_SWAP_I32:
	    return emitAtomicCmpSwap(MI, BB, 4);
	  case Mips::ATOMIC_CMP_SWAP_I64:
	    return emitAtomicCmpSwap(MI, BB, 8);

	  // insertDivByZeroTrap takes the block and the instruction info by
	  // reference, so both pointers are dereferenced at the call sites.
	  case Mips::PseudoSDIV:
	  case Mips::PseudoUDIV:
	  case Mips::DIV:
	  case Mips::DIVU:
	  case Mips::MOD:
	  case Mips::MODU:
	    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false,
	                               false);
	  case Mips::SDIV_MM_Pseudo:
	  case Mips::UDIV_MM_Pseudo:
	  case Mips::SDIV_MM:
	  case Mips::UDIV_MM:
	  case Mips::DIV_MMR6:
	  case Mips::DIVU_MMR6:
	  case Mips::MOD_MMR6:
	  case Mips::MODU_MMR6:
	    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false,
	                               true);
	  case Mips::PseudoDSDIV:
	  case Mips::PseudoDUDIV:
	  case Mips::DDIV:
	  case Mips::DDIVU:
	  case Mips::DMOD:
	  case Mips::DMODU:
	    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true,
	                               false);

	  case Mips::PseudoSELECT_I:
	  case Mips::PseudoSELECT_I64:
	  case Mips::PseudoSELECT_S:
	  case Mips::PseudoSELECT_D32:
	  case Mips::PseudoSELECT_D64:
	    return emitPseudoSELECT(MI, BB, false, Mips::BNE);
	  case Mips::PseudoSELECTFP_F_I:
	  case Mips::PseudoSELECTFP_F_I64:
	  case Mips::PseudoSELECTFP_F_S:
	  case Mips::PseudoSELECTFP_F_D32:
	  case Mips::PseudoSELECTFP_F_D64:
	    return emitPseudoSELECT(MI, BB, true, Mips::BC1F);
	  case Mips::PseudoSELECTFP_T_I:
	  case Mips::PseudoSELECTFP_T_I64:
	  case Mips::PseudoSELECTFP_T_S:
	  case Mips::PseudoSELECTFP_T_D32:
	  case Mips::PseudoSELECTFP_T_D64:
	    return emitPseudoSELECT(MI, BB, true, Mips::BC1T);
	  }
	}

	// Expand a 4- or 8-byte atomic read-modify-write pseudo MI into an LL/SC
	// retry loop placed in new blocks after BB.
	//
	// MI operand 0 receives the value loaded by LL, operand 1 is the pointer and
	// operand 2 the increment / new value. Size is the access size in bytes.
	// BinOpcode is the two-register operation applied to the loaded value and
	// the increment.
	//
	// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
	// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
	//
	// Returns exitMBB, which takes over everything that followed MI in BB as
	// well as BB's successor edges. MI is erased.
	MachineBasicBlock *MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
	                                                        MachineBasicBlock *BB,
	                                                        unsigned Size,
	                                                        unsigned BinOpcode,
	                                                        bool Nand) const {
	  assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");

	  MachineFunction *MF = BB->getParent();
	  MachineRegisterInfo &RegInfo = MF->getRegInfo();
	  // Size is in bytes; the register class is looked up by bit width.
	  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const bool ArePtrs64bit = ABI.ArePtrs64bit();
	  DebugLoc DL = MI.getDebugLoc();
	  unsigned LL, SC, AND, NOR, ZERO, BEQ;

	  // Pick the opcodes and zero register that match the access size.
	  if (Size == 4) {
	    if (isMicroMips) {
	      LL = Mips::LL_MM;
	      SC = Mips::SC_MM;
	    } else {
	      LL = Subtarget.hasMips32r6()
	               ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
	               : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
	      SC = Subtarget.hasMips32r6()
	               ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
	               : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
	    }

	    AND = Mips::AND;
	    NOR = Mips::NOR;
	    ZERO = Mips::ZERO;
	    BEQ = Mips::BEQ;
	  } else {
	    LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
	    SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
	    AND = Mips::AND64;
	    NOR = Mips::NOR64;
	    ZERO = Mips::ZERO_64;
	    BEQ = Mips::BEQ64;
	  }

	  unsigned OldVal = MI.getOperand(0).getReg();
	  unsigned Ptr = MI.getOperand(1).getReg();
	  unsigned Incr = MI.getOperand(2).getReg();

	  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
	  unsigned AndRes = RegInfo.createVirtualRegister(RC);
	  unsigned Success = RegInfo.createVirtualRegister(RC);

	  // insert new blocks after the current block
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();
	  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineFunction::iterator It = ++BB->getIterator();
	  MF->insert(It, loopMBB);
	  MF->insert(It, exitMBB);

	  // Transfer the remainder of BB and its successor edges to exitMBB.
	  exitMBB->splice(exitMBB->begin(), BB,
	                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // thisMBB:
	  // ...
	  // fallthrough --> loopMBB
	  BB->addSuccessor(loopMBB);
	  loopMBB->addSuccessor(loopMBB);
	  loopMBB->addSuccessor(exitMBB);

	  // loopMBB:
	  // ll oldval, 0(ptr)
	  // <binop> storeval, oldval, incr
	  // sc success, storeval, 0(ptr)
	  // beq success, $0, loopMBB
	  BB = loopMBB;
	  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
	  if (Nand) {
	    // and andres, oldval, incr
	    // nor storeval, $0, andres
	    BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
	    BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
	  } else if (BinOpcode) {
	    // <binop> storeval, oldval, incr
	    BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
	  } else {
	    // Swap: the incoming value is stored as is.
	    StoreVal = Incr;
	  }
	  BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
	  BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);

	  MI.eraseFromParent(); // The instruction is gone now.

	  return exitMBB;
	}

	/// Append to \p BB the instructions that sign-extend the low \p Size bytes of
	/// \p SrcReg into \p DstReg.
	///
	/// With hasMips32r2() a single SEB (Size == 1) or SEH (Size == 2) is emitted.
	/// Otherwise the value is shifted left and then arithmetically right by
	/// 32 - Size * 8 bits through a fresh i32 virtual register.
	///
	/// \param MI  Only used for its debug location.
	/// \returns \p BB.
	MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
	    MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
	    unsigned SrcReg) const {
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const DebugLoc &DL = MI.getDebugLoc();

	  // Dedicated sign-extension instructions.
	  if (Subtarget.hasMips32r2()) {
	    if (Size == 1) {
	      BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg);
	      return BB;
	    }
	    if (Size == 2) {
	      BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg);
	      return BB;
	    }
	  }

	  // Fallback: shift pair through a temporary register.
	  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
	  unsigned TmpReg = MRI.createVirtualRegister(getRegClassFor(MVT::i32));

	  // NOTE(review): Size is a byte count in the shift computation below, so a
	  // bound of 32 looks looser than intended - confirm.
	  assert(Size < 32);
	  int64_t ShiftBits = 32 - (Size * 8);

	  BuildMI(BB, DL, TII->get(Mips::SLL), TmpReg).addReg(SrcReg).addImm(ShiftBits);
	  BuildMI(BB, DL, TII->get(Mips::SRA), DstReg).addReg(TmpReg).addImm(ShiftBits);

	  return BB;
	}

	// Expand a 1- or 2-byte atomic read-modify-write pseudo MI into an LL/SC
	// retry loop that operates on the 4-byte-aligned word containing the
	// addressed bytes.
	//
	// MI operand 0 is the destination register, operand 1 the pointer and
	// operand 2 the increment / new value. Size is the access size in bytes.
	// BinOpcode is the operation applied to the loaded word and the shifted
	// increment; BinOpcode == 0 with Nand == false is an atomic swap, and
	// Nand == true selects and-then-nor.
	//
	// Returns exitMBB, which takes over everything that followed MI in BB as
	// well as BB's successor edges. MI is erased.
	MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
	MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
	bool Nand) const {
	assert((Size == 1 \|\| Size == 2) &&
	"Unsupported size for EmitAtomicBinaryPartial.");

	MachineFunction *MF = BB->getParent();
	MachineRegisterInfo &RegInfo = MF->getRegInfo();
	// RC holds the 32-bit working values; RCp is the pointer-width class.
	const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
	const bool ArePtrs64bit = ABI.ArePtrs64bit();
	const TargetRegisterClass *RCp =
	getRegClassFor(ArePtrs64bit ? MVT::i64 : MVT::i32);
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	DebugLoc DL = MI.getDebugLoc();

	unsigned Dest = MI.getOperand(0).getReg();
	unsigned Ptr = MI.getOperand(1).getReg();
	unsigned Incr = MI.getOperand(2).getReg();

	// Scratch virtual registers for the address, mask and value computations.
	unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
	unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
	unsigned Mask = RegInfo.createVirtualRegister(RC);
	unsigned Mask2 = RegInfo.createVirtualRegister(RC);
	unsigned NewVal = RegInfo.createVirtualRegister(RC);
	unsigned OldVal = RegInfo.createVirtualRegister(RC);
	unsigned Incr2 = RegInfo.createVirtualRegister(RC);
	unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
	unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
	unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
	unsigned AndRes = RegInfo.createVirtualRegister(RC);
	unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
	unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
	unsigned StoreVal = RegInfo.createVirtualRegister(RC);
	unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
	unsigned SrlRes = RegInfo.createVirtualRegister(RC);
	unsigned Success = RegInfo.createVirtualRegister(RC);

	// Select the LL/SC opcodes for microMIPS, R6 and 64-bit-pointer variants.
	unsigned LL, SC;
	if (isMicroMips) {
	LL = Mips::LL_MM;
	SC = Mips::SC_MM;
	} else {
	LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
	: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
	SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
	: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
	}

	// insert new blocks after the current block
	const BasicBlock *LLVM_BB = BB->getBasicBlock();
	MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineFunction::iterator It = ++BB->getIterator();
	MF->insert(It, loopMBB);
	MF->insert(It, sinkMBB);
	MF->insert(It, exitMBB);

	// Transfer the remainder of BB and its successor edges to exitMBB.
	exitMBB->splice(exitMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	// Control flow: BB -> loopMBB (retries itself) -> sinkMBB -> exitMBB.
	BB->addSuccessor(loopMBB);
	loopMBB->addSuccessor(loopMBB);
	loopMBB->addSuccessor(sinkMBB);
	sinkMBB->addSuccessor(exitMBB);

	// thisMBB:
	// addiu masklsb2,$0,-4 # 0xfffffffc
	// and alignedaddr,ptr,masklsb2
	// andi ptrlsb2,ptr,3
	// sll shiftamt,ptrlsb2,3
	// ori maskupper,$0,255 # 0xff
	// sll mask,maskupper,shiftamt
	// nor mask2,$0,mask
	// sll incr2,incr,shiftamt

	// 0xff for byte accesses, 0xffff for halfword accesses.
	int64_t MaskImm = (Size == 1) ? 255 : 65535;
	BuildMI(BB, DL, TII->get(ABI.GetPtrAddiuOp()), MaskLSB2)
	.addReg(ABI.GetNullPtr()).addImm(-4);
	BuildMI(BB, DL, TII->get(ABI.GetPtrAndOp()), AlignedAddr)
	.addReg(Ptr).addReg(MaskLSB2);
	// With 64-bit pointers only the low 32-bit sub-register of Ptr is read
	// here.
	BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2)
	.addReg(Ptr, 0, ArePtrs64bit ? Mips::sub_32 : 0).addImm(3);
	if (Subtarget.isLittle()) {
	BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
	} else {
	// Not little-endian: xor the byte offset with 3 (bytes) or 2 (halfwords)
	// before scaling it to a bit count.
	unsigned Off = RegInfo.createVirtualRegister(RC);
	BuildMI(BB, DL, TII->get(Mips::XORi), Off)
	.addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
	BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
	}
	BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
	.addReg(Mips::ZERO).addImm(MaskImm);
	BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
	.addReg(MaskUpper).addReg(ShiftAmt);
	BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
	BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(Incr).addReg(ShiftAmt);

	// atomic.load.binop
	// loopMBB:
	// ll oldval,0(alignedaddr)
	// binop binopres,oldval,incr2
	// and newval,binopres,mask
	// and maskedoldval0,oldval,mask2
	// or storeval,maskedoldval0,newval
	// sc success,storeval,0(alignedaddr)
	// beq success,$0,loopMBB

	// atomic.swap
	// loopMBB:
	// ll oldval,0(alignedaddr)
	// and newval,incr2,mask
	// and maskedoldval0,oldval,mask2
	// or storeval,maskedoldval0,newval
	// sc success,storeval,0(alignedaddr)
	// beq success,$0,loopMBB

	BB = loopMBB;
	BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
	if (Nand) {
	// and andres, oldval, incr2
	// nor binopres, $0, andres
	// and newval, binopres, mask
	BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
	BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
	.addReg(Mips::ZERO).addReg(AndRes);
	BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
	} else if (BinOpcode) {
	// <binop> binopres, oldval, incr2
	// and newval, binopres, mask
	BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
	BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
	} else { // atomic.swap
	// and newval, incr2, mask
	BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
	}

	// Merge the new field into the untouched bits of the loaded word, try to
	// store it, and retry the loop when SC reports failure (Success == 0).
	BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
	.addReg(OldVal).addReg(Mask2);
	BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
	.addReg(MaskedOldVal0).addReg(NewVal);
	BuildMI(BB, DL, TII->get(SC), Success)
	.addReg(StoreVal).addReg(AlignedAddr).addImm(0);
	BuildMI(BB, DL, TII->get(Mips::BEQ))
	.addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);

	// sinkMBB:
	// and maskedoldval1,oldval,mask
	// srl srlres,maskedoldval1,shiftamt
	// sign_extend dest,srlres
	BB = sinkMBB;

	BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
	.addReg(OldVal).addReg(Mask);
	BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
	.addReg(MaskedOldVal1).addReg(ShiftAmt);
	BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);

	MI.eraseFromParent(); // The instruction is gone now.

	return exitMBB;
	}

	// Expand a 4- or 8-byte atomic compare-and-swap pseudo MI into a two-block
	// LL/SC loop placed after BB.
	//
	// MI operand 0 receives the value loaded by LL, operand 1 is the pointer,
	// operand 2 the expected value and operand 3 the replacement value. Size is
	// the access size in bytes.
	//
	// Returns exitMBB, which takes over everything that followed MI in BB as
	// well as BB's successor edges. MI is erased.
	MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
	                                                         MachineBasicBlock *BB,
	                                                         unsigned Size) const {
	  assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");

	  MachineFunction *MF = BB->getParent();
	  MachineRegisterInfo &RegInfo = MF->getRegInfo();
	  // Size is in bytes; the register class is looked up by bit width.
	  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const bool ArePtrs64bit = ABI.ArePtrs64bit();
	  DebugLoc DL = MI.getDebugLoc();
	  unsigned LL, SC, ZERO, BNE, BEQ;

	  // Pick the opcodes and zero register that match the access size.
	  if (Size == 4) {
	    if (isMicroMips) {
	      LL = Mips::LL_MM;
	      SC = Mips::SC_MM;
	    } else {
	      LL = Subtarget.hasMips32r6()
	               ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
	               : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
	      SC = Subtarget.hasMips32r6()
	               ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
	               : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
	    }

	    ZERO = Mips::ZERO;
	    BNE = Mips::BNE;
	    BEQ = Mips::BEQ;
	  } else {
	    LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
	    SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
	    ZERO = Mips::ZERO_64;
	    BNE = Mips::BNE64;
	    BEQ = Mips::BEQ64;
	  }

	  unsigned Dest = MI.getOperand(0).getReg();
	  unsigned Ptr = MI.getOperand(1).getReg();
	  unsigned OldVal = MI.getOperand(2).getReg();
	  unsigned NewVal = MI.getOperand(3).getReg();

	  unsigned Success = RegInfo.createVirtualRegister(RC);

	  // insert new blocks after the current block
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();
	  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineFunction::iterator It = ++BB->getIterator();
	  MF->insert(It, loop1MBB);
	  MF->insert(It, loop2MBB);
	  MF->insert(It, exitMBB);

	  // Transfer the remainder of BB and its successor edges to exitMBB.
	  exitMBB->splice(exitMBB->begin(), BB,
	                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // thisMBB:
	  // ...
	  // fallthrough --> loop1MBB
	  BB->addSuccessor(loop1MBB);
	  loop1MBB->addSuccessor(exitMBB);
	  loop1MBB->addSuccessor(loop2MBB);
	  loop2MBB->addSuccessor(loop1MBB);
	  loop2MBB->addSuccessor(exitMBB);

	  // loop1MBB:
	  // ll dest, 0(ptr)
	  // bne dest, oldval, exitMBB
	  BB = loop1MBB;
	  BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
	  BuildMI(BB, DL, TII->get(BNE))
	      .addReg(Dest).addReg(OldVal).addMBB(exitMBB);

	  // loop2MBB:
	  // sc success, newval, 0(ptr)
	  // beq success, $0, loop1MBB
	  BB = loop2MBB;
	  BuildMI(BB, DL, TII->get(SC), Success)
	      .addReg(NewVal).addReg(Ptr).addImm(0);
	  BuildMI(BB, DL, TII->get(BEQ))
	      .addReg(Success).addReg(ZERO).addMBB(loop1MBB);

	  MI.eraseFromParent(); // The instruction is gone now.

	  return exitMBB;
	}

	MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
	MachineInstr &MI, MachineBasicBlock *BB, unsigned Size) const {
	assert((Size == 1 \|\| Size == 2) &&
	"Unsupported size for EmitAtomicCmpSwapPartial.");

	MachineFunction *MF = BB->getParent();
	MachineRegisterInfo &RegInfo = MF->getRegInfo();
	const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
	const bool ArePtrs64bit = ABI.ArePtrs64bit();
	const TargetRegisterClass *RCp =
	getRegClassFor(ArePtrs64bit ? MVT::i64 : MVT::i32);
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	DebugLoc DL = MI.getDebugLoc();

	unsigned Dest = MI.getOperand(0).getReg();
	unsigned Ptr = MI.getOperand(1).getReg();
	unsigned CmpVal = MI.getOperand(2).getReg();
	unsigned NewVal = MI.getOperand(3).getReg();

	unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
	unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
	unsigned Mask = RegInfo.createVirtualRegister(RC);
	unsigned Mask2 = RegInfo.createVirtualRegister(RC);
	unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
	unsigned OldVal = RegInfo.createVirtualRegister(RC);
	unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
	unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
	unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
	unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
	unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
	unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
	unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
	unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
	unsigned StoreVal = RegInfo.createVirtualRegister(RC);
	unsigned SrlRes = RegInfo.createVirtualRegister(RC);
	unsigned Success = RegInfo.createVirtualRegister(RC);
	unsigned LL, SC;

	if (isMicroMips) {
	LL = Mips::LL_MM;
	SC = Mips::SC_MM;
	} else {
	LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
	: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
	SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
	: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
	}

	// insert new blocks after the current block
	const BasicBlock *LLVM_BB = BB->getBasicBlock();
	MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineFunction::iterator It = ++BB->getIterator();
	MF->insert(It, loop1MBB);
	MF->insert(It, loop2MBB);
	MF->insert(It, sinkMBB);
	MF->insert(It, exitMBB);

	// Transfer the remainder of BB and its successor edges to exitMBB.
	exitMBB->splice(exitMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	BB->addSuccessor(loop1MBB);
	loop1MBB->addSuccessor(sinkMBB);
	loop1MBB->addSuccessor(loop2MBB);
	loop2MBB->addSuccessor(loop1MBB);
	loop2MBB->addSuccessor(sinkMBB);
	sinkMBB->addSuccessor(exitMBB);

	// FIXME: computation of newval2 can be moved to loop2MBB.
	// thisMBB:
	// addiu masklsb2,$0,-4 # 0xfffffffc
	// and alignedaddr,ptr,masklsb2
	// andi ptrlsb2,ptr,3
	// xori ptrlsb2,ptrlsb2,3 # Only for BE
	// sll shiftamt,ptrlsb2,3
	// ori maskupper,$0,255 # 0xff
	// sll mask,maskupper,shiftamt
	// nor mask2,$0,mask
	// andi maskedcmpval,cmpval,255
	// sll shiftedcmpval,maskedcmpval,shiftamt
	// andi maskednewval,newval,255
	// sll shiftednewval,maskednewval,shiftamt
	int64_t MaskImm = (Size == 1) ? 255 : 65535;
	BuildMI(BB, DL, TII->get(ArePtrs64bit ? Mips::DADDiu : Mips::ADDiu), MaskLSB2)
	.addReg(ABI.GetNullPtr()).addImm(-4);
	BuildMI(BB, DL, TII->get(ArePtrs64bit ? Mips::AND64 : Mips::AND), AlignedAddr)
	.addReg(Ptr).addReg(MaskLSB2);
	BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2)
	.addReg(Ptr, 0, ArePtrs64bit ? Mips::sub_32 : 0).addImm(3);
	if (Subtarget.isLittle()) {
	BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
	} else {
	unsigned Off = RegInfo.createVirtualRegister(RC);
	BuildMI(BB, DL, TII->get(Mips::XORi), Off)
	.addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
	BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
	}
	BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
	.addReg(Mips::ZERO).addImm(MaskImm);
	BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
	.addReg(MaskUpper).addReg(ShiftAmt);
	BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
	BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedCmpVal)
	.addReg(CmpVal).addImm(MaskImm);
	BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedCmpVal)
	.addReg(MaskedCmpVal).addReg(ShiftAmt);
	BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedNewVal)
	.addReg(NewVal).addImm(MaskImm);
	BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
	.addReg(MaskedNewVal).addReg(ShiftAmt);

	// loop1MBB:
	// ll oldval,0(alginedaddr)
	// and maskedoldval0,oldval,mask
	// bne maskedoldval0,shiftedcmpval,sinkMBB
	BB = loop1MBB;
	BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
	BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
	.addReg(OldVal).addReg(Mask);
	BuildMI(BB, DL, TII->get(Mips::BNE))
	.addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);

	// loop2MBB:
	// and maskedoldval1,oldval,mask2
	// or storeval,maskedoldval1,shiftednewval
	// sc success,storeval,0(alignedaddr)
	// beq success,$0,loop1MBB
	BB = loop2MBB;
	BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
	.addReg(OldVal).addReg(Mask2);
	BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
	.addReg(MaskedOldVal1).addReg(ShiftedNewVal);
	BuildMI(BB, DL, TII->get(SC), Success)
	.addReg(StoreVal).addReg(AlignedAddr).addImm(0);
	BuildMI(BB, DL, TII->get(Mips::BEQ))
	.addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);

	// sinkMBB:
	// srl srlres,maskedoldval0,shiftamt
	// sign_extend dest,srlres
	BB = sinkMBB;

	BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
	.addReg(MaskedOldVal0).addReg(ShiftAmt);
	BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);

	MI.eraseFromParent(); // The instruction is gone now.

	return exitMBB;
	}

	SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
	  // BRCOND operands: 0 = chain, 1 = condition, 2 = block taken when the
	  // condition is true.
	  SDValue InChain = Op.getOperand(0);
	  SDValue Target = Op.getOperand(2);
	  SDLoc Loc(Op);

	  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
	  SDValue FPCond = createFPCmp(DAG, Op.getOperand(1));

	  // Conditions that did not come from a floating point comparison are handed
	  // back unchanged.
	  const bool IsFPCompare = FPCond.getOpcode() == MipsISD::FPCmp;
	  if (!IsFPCompare)
	    return Op;

	  // Operand 2 of the FPCmp node is a constant holding the condition code.
	  const auto Code = static_cast<Mips::CondCode>(
	      cast<ConstantSDNode>(FPCond.getOperand(2))->getZExtValue());

	  // Pick the branch flavour depending on whether users of this condition code
	  // must see it inverted.
	  unsigned BranchOpc = Mips::BRANCH_T;
	  if (invertFPCondCodeUser(Code))
	    BranchOpc = Mips::BRANCH_F;

	  SDValue BranchKind = DAG.getConstant(BranchOpc, Loc, MVT::i32);
	  SDValue CondReg = DAG.getRegister(Mips::FCC0, MVT::i32);
	  return DAG.getNode(MipsISD::FPBrcond, Loc, Op.getValueType(), InChain,
	                     BranchKind, CondReg, Target, FPCond);
	}

	SDValue MipsTargetLowering::lowerSELECT(SDValue Op,
	                                        SelectionDAG &DAG) const {
	  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());

	  // Operand 0 is the select condition; try to express it as an FPCmp node.
	  SDValue FPCond = createFPCmp(DAG, Op.getOperand(0));

	  // A condition that is not a floating point comparison is left untouched.
	  const bool IsFPCompare = FPCond.getOpcode() == MipsISD::FPCmp;
	  if (!IsFPCompare)
	    return Op;

	  // Operands 1 and 2 are the values chosen for true and false respectively.
	  SDValue WhenTrue = Op.getOperand(1);
	  SDValue WhenFalse = Op.getOperand(2);
	  return createCMovFP(DAG, FPCond, WhenTrue, WhenFalse, SDLoc(Op));
	}

	SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
	  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());

	  // The whole SETCC node is converted; the result must be an FPCmp.
	  SDValue FPCond = createFPCmp(DAG, Op);
	  assert(FPCond.getOpcode() == MipsISD::FPCmp &&
	         "Floating point operand expected.");

	  // Materialise the i32 result as a conditional move between 1 and 0.
	  SDLoc Loc(Op);
	  SDValue One = DAG.getConstant(1, Loc, MVT::i32);
	  SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
	  return createCMovFP(DAG, FPCond, One, Zero, Loc);
	}

	// Lower a GlobalAddress node, choosing the addressing sequence from the
	// relocation model, the small-section placement of the global, its linkage
	// and the ABI.
	SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  EVT Ty = Op.getValueType();
	  GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = N->getGlobal();

	  if (!isPositionIndependent()) {
	    const MipsTargetObjectFile *TLOF =
	        static_cast<const MipsTargetObjectFile *>(
	            getTargetMachine().getObjFileLowering());
	    const GlobalObject *GO = GV->getBaseObject();
	    if (GO && TLOF->IsGlobalInSmallSection(GO, getTargetMachine()))
	      // %gp_rel relocation
	      return getAddrGPRel(N, SDLoc(N), Ty, DAG, ABI.IsN64());

	    // %hi/%lo relocation
	    return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
	                                // %highest/%higher/%hi/%lo relocation
	                                : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
	  }

	  // Every other architecture would use shouldAssumeDSOLocal in here, but
	  // mips is special.
	  // * In PIC code mips requires got loads even for local statics!
	  // * To save on got entries, for local statics the got entry contains the
	  //   page and an additional add instruction takes care of the low bits.
	  // * It is legal to access a hidden symbol with a non hidden undefined,
	  //   so one cannot guarantee that all access to a hidden symbol will know
	  //   it is hidden.
	  // * Mips linkers don't support creating a page and a full got entry for
	  //   the same symbol.
	  // * Given all that, we have to use a full got entry for hidden symbols :-(
	  if (GV->hasLocalLinkage())
	    return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());

	  if (LargeGOT)
	    return getAddrGlobalLargeGOT(
	        N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16,
	        DAG.getEntryNode(),
	        MachinePointerInfo::getGOT(DAG.getMachineFunction()));

	  // N32/N64 use the GOT_DISP flag; the remaining ABI uses plain GOT.
	  return getAddrGlobal(
	      N, SDLoc(N), Ty, DAG,
	      (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT,
	      DAG.getEntryNode(), MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	}

	// Lower a BlockAddress node: absolute addressing when the code is not
	// position independent, otherwise the local (GOT page + offset) form.
	SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
	  EVT Ty = Op.getValueType();

	  if (!isPositionIndependent())
	    return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
	                                : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);

	  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
	}

	// Lower a thread-local GlobalAddress node according to the TLS model the
	// target machine selects for the global.
	SDValue MipsTargetLowering::lowerGlobalTLSAddress(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  // If the relocation model is PIC, use the General Dynamic TLS Model or
	  // Local Dynamic TLS model, otherwise use the Initial Exec or
	  // Local Exec TLS Model.

	  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
	  if (DAG.getTarget().Options.EmulatedTLS)
	    return LowerToTLSEmulatedModel(GA, DAG);

	  SDLoc DL(GA);
	  const GlobalValue *GV = GA->getGlobal();
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  TLSModel::Model model = getTargetMachine().getTLSModel(GV);

	  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
	    // General Dynamic and Local Dynamic TLS Model.
	    unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM
	                                                      : MipsII::MO_TLSGD;

	    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, Flag);
	    SDValue Argument = DAG.getNode(MipsISD::Wrapper, DL, PtrVT,
	                                   getGlobalReg(DAG, PtrVT), TGA);
	    unsigned PtrSize = PtrVT.getSizeInBits();
	    // Pointer-sized integer type used for both the argument and the result
	    // of the __tls_get_addr call.
	    IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize);

	    SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT);

	    ArgListTy Args;
	    ArgListEntry Entry;
	    Entry.Node = Argument;
	    Entry.Ty = PtrTy;
	    Args.push_back(Entry);

	    TargetLowering::CallLoweringInfo CLI(DAG);
	    CLI.setDebugLoc(DL)
	        .setChain(DAG.getEntryNode())
	        .setLibCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
	    std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

	    SDValue Ret = CallResult.first;

	    // For General Dynamic the call result is already the final address.
	    if (model != TLSModel::LocalDynamic)
	      return Ret;

	    // Local Dynamic: add the DTPREL hi/lo parts to the call result.
	    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
	                                               MipsII::MO_DTPREL_HI);
	    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
	    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
	                                               MipsII::MO_DTPREL_LO);
	    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
	    SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Ret);
	    return DAG.getNode(ISD::ADD, DL, PtrVT, Add, Lo);
	  }

	  SDValue Offset;
	  if (model == TLSModel::InitialExec) {
	    // Initial Exec TLS Model
	    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
	                                             MipsII::MO_GOTTPREL);
	    TGA = DAG.getNode(MipsISD::Wrapper, DL, PtrVT, getGlobalReg(DAG, PtrVT),
	                      TGA);
	    Offset =
	        DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), TGA, MachinePointerInfo());
	  } else {
	    // Local Exec TLS Model
	    assert(model == TLSModel::LocalExec);
	    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
	                                               MipsII::MO_TPREL_HI);
	    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
	                                               MipsII::MO_TPREL_LO);
	    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
	    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
	    Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
	  }

	  // Both exec models produce an offset that is added to the thread pointer.
	  SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadPointer, Offset);
	}

	// Lower a JumpTable node: absolute addressing when the code is not position
	// independent, otherwise the local (GOT page + offset) form.
	SDValue MipsTargetLowering::lowerJumpTable(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
	  EVT Ty = Op.getValueType();

	  if (!isPositionIndependent())
	    return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
	                                : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);

	  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
	}

	// Lower a ConstantPool node. Non-PIC code uses %gp_rel for constants placed
	// in the small section and absolute addressing otherwise; PIC code uses the
	// local (GOT page + offset) form.
	SDValue MipsTargetLowering::lowerConstantPool(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
	  EVT Ty = Op.getValueType();

	  if (!isPositionIndependent()) {
	    const MipsTargetObjectFile *TLOF =
	        static_cast<const MipsTargetObjectFile *>(
	            getTargetMachine().getObjFileLowering());

	    if (TLOF->IsConstantInSmallSection(DAG.getDataLayout(), N->getConstVal(),
	                                       getTargetMachine()))
	      // %gp_rel relocation
	      return getAddrGPRel(N, SDLoc(N), Ty, DAG, ABI.IsN64());

	    return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
	                                : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
	  }

	  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
	}

	SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  const int VarArgsFI = MF.getInfo<MipsFunctionInfo>()->getVarArgsFrameIndex();

	  SDLoc Loc(Op);
	  SDValue VarArgsSlot =
	      DAG.getFrameIndex(VarArgsFI, getPointerTy(MF.getDataLayout()));

	  // va_start is a single store: the address of the VarArgsFrameIndex slot is
	  // written to the memory location given by operand 1. Operand 0 is the chain
	  // and operand 2 carries the source value used for the pointer info.
	  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDValue InChain = Op.getOperand(0);
	  SDValue DestPtr = Op.getOperand(1);
	  return DAG.getStore(InChain, Loc, VarArgsSlot, DestPtr,
	                      MachinePointerInfo(SrcVal));
	}

	// Lower a VAARG node: load the current va_list pointer, realign it when the
	// requested alignment exceeds the minimum stack argument alignment, store
	// back the pointer advanced past the argument's slot(s), and load the
	// argument itself.
	SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  SDNode *Node = Op.getNode();
	  EVT VT = Node->getValueType(0);
	  SDValue Chain = Node->getOperand(0);
	  SDValue VAListPtr = Node->getOperand(1);
	  unsigned Align = Node->getConstantOperandVal(3);
	  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
	  SDLoc DL(Node);
	  // Argument slots are 8 bytes wide on N32/N64 and 4 bytes otherwise.
	  unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4;

	  SDValue VAListLoad = DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL, Chain,
	                                   VAListPtr, MachinePointerInfo(SV));
	  SDValue VAList = VAListLoad;

	  // Re-align the pointer if necessary.
	  // It should only ever be necessary for 64-bit types on O32 since the minimum
	  // argument alignment is the same as the maximum type alignment for N32/N64.
	  //
	  // FIXME: We currently align too often. The code generator doesn't notice
	  //        when the pointer is still aligned from the last va_arg (or pair of
	  //        va_args for the i64 on O32 case).
	  if (Align > getMinStackArgumentAlignment()) {
	    assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");

	    // Round up: VAList = (VAList + Align - 1) & -Align.
	    VAList = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
	                         DAG.getConstant(Align - 1, DL, VAList.getValueType()));

	    VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
	                         DAG.getConstant(-(int64_t)Align, DL,
	                                         VAList.getValueType()));
	  }

	  // Increment the pointer, VAList, to the next vaarg.
	  auto &TD = DAG.getDataLayout();
	  unsigned ArgSizeInBytes =
	      TD.getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext()));
	  SDValue Tmp3 =
	      DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
	                  DAG.getConstant(alignTo(ArgSizeInBytes, ArgSlotSizeInBytes),
	                                  DL, VAList.getValueType()));
	  // Store the incremented VAList to the legalized pointer
	  Chain = DAG.getStore(VAListLoad.getValue(1), DL, Tmp3, VAListPtr,
	                       MachinePointerInfo(SV));

	  // In big-endian mode we must adjust the pointer when the load size is smaller
	  // than the argument slot size. We must also reduce the known alignment to
	  // match. For example in the N64 ABI, we must add 4 bytes to the offset to get
	  // the correct half of the slot, and reduce the alignment from 8 (slot
	  // alignment) down to 4 (type alignment).
	  if (!Subtarget.isLittle() && ArgSizeInBytes < ArgSlotSizeInBytes) {
	    unsigned Adjustment = ArgSlotSizeInBytes - ArgSizeInBytes;
	    VAList = DAG.getNode(ISD::ADD, DL, VAListPtr.getValueType(), VAList,
	                         DAG.getIntPtrConstant(Adjustment, DL));
	  }
	  // Load the actual argument out of the pointer VAList
	  return DAG.getLoad(VT, DL, Chain, VAList, MachinePointerInfo());
	}

	static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
	                                bool HasExtractInsert) {
	  SDValue Mag = Op.getOperand(0);
	  SDValue Sgn = Op.getOperand(1);
	  const bool MagIsF32 = Mag.getValueType() == MVT::f32;
	  const bool SgnIsF32 = Sgn.getValueType() == MVT::f32;
	  SDLoc Loc(Op);
	  SDValue One = DAG.getConstant(1, Loc, MVT::i32);
	  SDValue ThirtyOne = DAG.getConstant(31, Loc, MVT::i32);

	  // Obtain the 32-bit word holding the sign bit of each operand: an f32 is
	  // bitcast to i32, anything else has element 1 extracted as i32.
	  SDValue X;
	  if (MagIsF32)
	    X = DAG.getNode(ISD::BITCAST, Loc, MVT::i32, Mag);
	  else
	    X = DAG.getNode(MipsISD::ExtractElementF64, Loc, MVT::i32, Mag, One);

	  SDValue Y;
	  if (SgnIsF32)
	    Y = DAG.getNode(ISD::BITCAST, Loc, MVT::i32, Sgn);
	  else
	    Y = DAG.getNode(MipsISD::ExtractElementF64, Loc, MVT::i32, Sgn, One);

	  SDValue Combined;
	  if (HasExtractInsert) {
	    // ext  E, Y, 31, 1  ; extract bit31 of Y
	    // ins  X, E, 31, 1  ; insert extracted bit at bit31 of X
	    SDValue SignBit =
	        DAG.getNode(MipsISD::Ext, Loc, MVT::i32, Y, ThirtyOne, One);
	    Combined =
	        DAG.getNode(MipsISD::Ins, Loc, MVT::i32, SignBit, ThirtyOne, One, X);
	  } else {
	    // sll SllX, X, 1
	    // srl SrlX, SllX, 1
	    // srl SrlY, Y, 31
	    // sll SllY, SrlY, 31
	    // or  Or, SrlX, SllY
	    SDValue SllX = DAG.getNode(ISD::SHL, Loc, MVT::i32, X, One);
	    SDValue SrlX = DAG.getNode(ISD::SRL, Loc, MVT::i32, SllX, One);
	    SDValue SrlY = DAG.getNode(ISD::SRL, Loc, MVT::i32, Y, ThirtyOne);
	    SDValue SllY = DAG.getNode(ISD::SHL, Loc, MVT::i32, SrlY, ThirtyOne);
	    Combined = DAG.getNode(ISD::OR, Loc, MVT::i32, SrlX, SllY);
	  }

	  // An f32 result is just the combined word reinterpreted.
	  if (MagIsF32)
	    return DAG.getNode(ISD::BITCAST, Loc, Mag.getValueType(), Combined);

	  // Otherwise rebuild the f64 from element 0 of the first operand and the
	  // combined word.
	  SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
	  SDValue LowX =
	      DAG.getNode(MipsISD::ExtractElementF64, Loc, MVT::i32, Mag, Zero);
	  return DAG.getNode(MipsISD::BuildPairF64, Loc, MVT::f64, LowX, Combined);
	}

	static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG,
	                                bool HasExtractInsert) {
	  SDValue Mag = Op.getOperand(0);
	  SDValue Sgn = Op.getOperand(1);
	  const unsigned WidthX = Mag.getValueSizeInBits();
	  const unsigned WidthY = Sgn.getValueSizeInBits();
	  EVT TyX = MVT::getIntegerVT(WidthX);
	  EVT TyY = MVT::getIntegerVT(WidthY);
	  SDLoc Loc(Op);
	  SDValue One = DAG.getConstant(1, Loc, MVT::i32);

	  // Reinterpret both operands as integers of their own width.
	  SDValue X = DAG.getNode(ISD::BITCAST, Loc, TyX, Mag);
	  SDValue Y = DAG.getNode(ISD::BITCAST, Loc, TyY, Sgn);

	  if (HasExtractInsert) {
	    // ext  E, Y, width(Y) - 1, 1  ; extract bit width(Y)-1 of Y
	    // ins  X, E, width(X) - 1, 1  ; insert extracted bit at bit width(X)-1 of X
	    SDValue SignPosY = DAG.getConstant(WidthY - 1, Loc, MVT::i32);
	    SDValue SignBit = DAG.getNode(MipsISD::Ext, Loc, TyY, Y, SignPosY, One);

	    // Bring the extracted bit to the integer type of X when widths differ.
	    if (WidthX > WidthY)
	      SignBit = DAG.getNode(ISD::ZERO_EXTEND, Loc, TyX, SignBit);
	    else if (WidthY > WidthX)
	      SignBit = DAG.getNode(ISD::TRUNCATE, Loc, TyX, SignBit);

	    SDValue SignPosX = DAG.getConstant(WidthX - 1, Loc, MVT::i32);
	    SDValue Inserted =
	        DAG.getNode(MipsISD::Ins, Loc, TyX, SignBit, SignPosX, One, X);
	    return DAG.getNode(ISD::BITCAST, Loc, Mag.getValueType(), Inserted);
	  }

	  // (d)sll SllX, X, 1
	  // (d)srl SrlX, SllX, 1
	  // (d)srl SrlY, Y, width(Y)-1
	  // (d)sll SllY, SrlY, width(X)-1
	  // or     Or, SrlX, SllY
	  SDValue SllX = DAG.getNode(ISD::SHL, Loc, TyX, X, One);
	  SDValue SrlX = DAG.getNode(ISD::SRL, Loc, TyX, SllX, One);
	  SDValue SignPosY = DAG.getConstant(WidthY - 1, Loc, MVT::i32);
	  SDValue SrlY = DAG.getNode(ISD::SRL, Loc, TyY, Y, SignPosY);

	  // Bring the isolated sign bit to the integer type of X when widths differ.
	  if (WidthX > WidthY)
	    SrlY = DAG.getNode(ISD::ZERO_EXTEND, Loc, TyX, SrlY);
	  else if (WidthY > WidthX)
	    SrlY = DAG.getNode(ISD::TRUNCATE, Loc, TyX, SrlY);

	  SDValue SignPosX = DAG.getConstant(WidthX - 1, Loc, MVT::i32);
	  SDValue SllY = DAG.getNode(ISD::SHL, Loc, TyX, SrlY, SignPosX);
	  SDValue Merged = DAG.getNode(ISD::OR, Loc, TyX, SrlX, SllY);
	  return DAG.getNode(ISD::BITCAST, Loc, Mag.getValueType(), Merged);
	}

	SDValue
	MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
	  // Dispatch on general-purpose register width; both helpers are told whether
	  // the subtarget has the ext/ins instructions.
	  const bool HasExtIns = Subtarget.hasExtractInsert();
	  return Subtarget.isGP64bit() ? lowerFCOPYSIGN64(Op, DAG, HasExtIns)
	                               : lowerFCOPYSIGN32(Op, DAG, HasExtIns);
	}

	SDValue MipsTargetLowering::lowerFRAMEADDR(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  // Only depth 0 (the current frame) is supported.
	  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
	         "Frame address can only be determined for current frame.");

	  // Record on the frame info that the frame address is taken.
	  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);

	  // The frame address is read straight from the frame pointer register,
	  // using the 64-bit register on N64.
	  const unsigned FrameReg = ABI.IsN64() ? Mips::FP_64 : Mips::FP;
	  EVT ResultVT = Op.getValueType();
	  SDLoc Loc(Op);
	  return DAG.getCopyFromReg(DAG.getEntryNode(), Loc, FrameReg, ResultVT);
	}

	SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  const bool Rejected = verifyReturnAddressArgumentIsConstant(Op, DAG);
	  if (Rejected)
	    return SDValue();

	  // Only depth 0 (the current frame) is supported.
	  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
	         "Return address can be determined only for current frame.");

	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
	  MVT ResultVT = Op.getSimpleValueType();
	  const unsigned RetAddrReg = ABI.IsN64() ? Mips::RA_64 : Mips::RA;
	  FrameInfo.setReturnAddressIsTaken(true);

	  // The return address lives in RA; register it as a function live-in and
	  // copy it out of the resulting register.
	  unsigned LiveInReg = MF.addLiveIn(RetAddrReg, getRegClassFor(ResultVT));
	  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), LiveInReg,
	                            ResultVT);
	}

	// EH_RETURN comes from lowering llvm.eh.return, which is itself produced for
	// __builtin_eh_return (offset, handler). Its effect is to adjust the stack
	// pointer by "offset" and then branch to "handler".
	SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
	    const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getInfo<MipsFunctionInfo>()->setCallsEhReturn();

	  SDValue InChain = Op.getOperand(0);
	  SDValue StackAdjust = Op.getOperand(1);
	  SDValue Landing = Op.getOperand(2);
	  SDLoc Loc(Op);
	  EVT AdjustTy = ABI.IsN64() ? MVT::i64 : MVT::i32;

	  // The stack offset goes in V1 and the jump target in V0. The two CopyToReg
	  // nodes and the EH_RETURN node are glued together so that the instructions
	  // are emitted back-to-back.
	  const unsigned AdjustReg = ABI.IsN64() ? Mips::V1_64 : Mips::V1;
	  const unsigned TargetReg = ABI.IsN64() ? Mips::V0_64 : Mips::V0;
	  SDValue CopyAdjust =
	      DAG.getCopyToReg(InChain, Loc, AdjustReg, StackAdjust, SDValue());
	  SDValue CopyTarget = DAG.getCopyToReg(CopyAdjust, Loc, TargetReg, Landing,
	                                        CopyAdjust.getValue(1));

	  SDValue AdjustRegNode = DAG.getRegister(AdjustReg, AdjustTy);
	  SDValue TargetRegNode =
	      DAG.getRegister(TargetReg, getPointerTy(MF.getDataLayout()));
	  return DAG.getNode(MipsISD::EH_RETURN, Loc, MVT::Other, CopyTarget,
	                     AdjustRegNode, TargetRegNode, CopyTarget.getValue(1));
	}

	SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // FIXME: Need pseudo-fence for 'singlethread' fences
	  // FIXME: Set SType for weaker fences where supported/appropriate.
	  const unsigned SType = 0;

	  // Every fence becomes a Sync node on the incoming chain with stype 0.
	  SDLoc Loc(Op);
	  SDValue InChain = Op.getOperand(0);
	  SDValue STypeNode = DAG.getConstant(SType, Loc, MVT::i32);
	  return DAG.getNode(MipsISD::Sync, Loc, MVT::Other, InChain, STypeNode);
	}

	SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  const MVT PartVT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;

	  SDValue LoIn = Op.getOperand(0);
	  SDValue HiIn = Op.getOperand(1);
	  SDValue Amount = Op.getOperand(2);

	  // With W = width of PartVT in bits:
	  //   if the W bit of Amount is clear:
	  //     lo = (shl lo, amount)
	  //     hi = (or (shl hi, amount) (srl (srl lo, 1), ~amount))
	  //   else:
	  //     lo = 0
	  //     hi = (shl lo, amount)
	  SDValue AllOnes = DAG.getConstant(-1, Loc, MVT::i32);
	  SDValue InvAmount = DAG.getNode(ISD::XOR, Loc, MVT::i32, Amount, AllOnes);
	  SDValue One = DAG.getConstant(1, Loc, PartVT);
	  SDValue LoShr1 = DAG.getNode(ISD::SRL, Loc, PartVT, LoIn, One);
	  SDValue LoCarry = DAG.getNode(ISD::SRL, Loc, PartVT, LoShr1, InvAmount);
	  SDValue HiShifted = DAG.getNode(ISD::SHL, Loc, PartVT, HiIn, Amount);
	  SDValue HiSmall = DAG.getNode(ISD::OR, Loc, PartVT, HiShifted, LoCarry);
	  SDValue LoShifted = DAG.getNode(ISD::SHL, Loc, PartVT, LoIn, Amount);

	  // Non-zero exactly when the W bit of the shift amount is set.
	  SDValue WidthBit = DAG.getConstant(PartVT.getSizeInBits(), Loc, MVT::i32);
	  SDValue IsLarge = DAG.getNode(ISD::AND, Loc, MVT::i32, Amount, WidthBit);

	  SDValue Zero = DAG.getConstant(0, Loc, PartVT);
	  SDValue LoOut =
	      DAG.getNode(ISD::SELECT, Loc, PartVT, IsLarge, Zero, LoShifted);
	  SDValue HiOut =
	      DAG.getNode(ISD::SELECT, Loc, PartVT, IsLarge, LoShifted, HiSmall);

	  SDValue Parts[2] = {LoOut, HiOut};
	  return DAG.getMergeValues(Parts, Loc);
	}

	SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
	                                                 bool IsSRA) const {
	  SDLoc Loc(Op);
	  SDValue LoIn = Op.getOperand(0);
	  SDValue HiIn = Op.getOperand(1);
	  SDValue Amount = Op.getOperand(2);
	  const MVT PartVT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;

	  // With W = width of PartVT in bits:
	  //   if the W bit of Amount is clear:
	  //     lo = (or (shl (shl hi, 1), ~amount) (srl lo, amount))
	  //     hi = IsSRA ? (sra hi, amount) : (srl hi, amount)
	  //   else:
	  //     lo = IsSRA ? (sra hi, amount) : (srl hi, amount)
	  //     hi = IsSRA ? (sra hi, W - 1) : 0
	  SDValue AllOnes = DAG.getConstant(-1, Loc, MVT::i32);
	  SDValue InvAmount = DAG.getNode(ISD::XOR, Loc, MVT::i32, Amount, AllOnes);
	  SDValue One = DAG.getConstant(1, Loc, PartVT);
	  SDValue HiShl1 = DAG.getNode(ISD::SHL, Loc, PartVT, HiIn, One);
	  SDValue HiCarry = DAG.getNode(ISD::SHL, Loc, PartVT, HiShl1, InvAmount);
	  SDValue LoShifted = DAG.getNode(ISD::SRL, Loc, PartVT, LoIn, Amount);
	  SDValue LoSmall = DAG.getNode(ISD::OR, Loc, PartVT, HiCarry, LoShifted);

	  const unsigned HiShiftOpc = IsSRA ? ISD::SRA : ISD::SRL;
	  SDValue HiShifted = DAG.getNode(HiShiftOpc, Loc, PartVT, HiIn, Amount);

	  // Non-zero exactly when the W bit of the shift amount is set.
	  SDValue WidthBit = DAG.getConstant(PartVT.getSizeInBits(), Loc, MVT::i32);
	  SDValue IsLarge = DAG.getNode(ISD::AND, Loc, MVT::i32, Amount, WidthBit);

	  // Sign fill of the high part, used as the large-shift high result for SRA.
	  SDValue TopBit = DAG.getConstant(PartVT.getSizeInBits() - 1, Loc, PartVT);
	  SDValue SignFill = DAG.getNode(ISD::SRA, Loc, PartVT, HiIn, TopBit);

	  SDValue LoOut =
	      DAG.getNode(ISD::SELECT, Loc, PartVT, IsLarge, HiShifted, LoSmall);

	  SDValue HiLarge = SignFill;
	  if (!IsSRA)
	    HiLarge = DAG.getConstant(0, Loc, PartVT);
	  SDValue HiOut =
	      DAG.getNode(ISD::SELECT, Loc, PartVT, IsLarge, HiLarge, HiShifted);

	  SDValue Parts[2] = {LoOut, HiOut};
	  return DAG.getMergeValues(Parts, Loc);
	}

	// Build one half (Opc) of a left/right load pair for LD. The node reads from
	// LD's base pointer plus Offset, takes Src as its incoming register value,
	// and reuses LD's memory type and memory operand.
	static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
	                            SDValue Chain, SDValue Src, unsigned Offset) {
	  SDLoc Loc(LD);
	  SDValue Addr = LD->getBasePtr();
	  EVT AddrVT = Addr.getValueType();

	  // The node yields the loaded value plus an output chain.
	  SDVTList ResultTys = DAG.getVTList(LD->getValueType(0), MVT::Other);

	  if (Offset != 0) {
	    SDValue OffsetNode = DAG.getConstant(Offset, Loc, AddrVT);
	    Addr = DAG.getNode(ISD::ADD, Loc, AddrVT, Addr, OffsetNode);
	  }

	  SDValue Operands[] = { Chain, Addr, Src };
	  return DAG.getMemIntrinsicNode(Opc, Loc, ResultTys, Operands,
	                                 LD->getMemoryVT(), LD->getMemOperand());
	}

	// Expand an unaligned 32 or 64-bit integer load node.
	// Returns Op unchanged when the subtarget supports unaligned accesses, an
	// empty SDValue when the load is aligned or its memory type is neither i32
	// nor i64, and otherwise an LWL/LWR or LDL/LDR based replacement.
	SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
	  LoadSDNode *LD = cast<LoadSDNode>(Op);
	  EVT MemVT = LD->getMemoryVT();

	  if (Subtarget.systemSupportsUnalignedAccess())
	    return Op;

	  // Return if load is aligned or if MemVT is neither i32 nor i64.
	  if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
	      ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
	    return SDValue();

	  bool IsLittle = Subtarget.isLittle();
	  EVT VT = Op.getValueType();
	  ISD::LoadExtType ExtType = LD->getExtensionType();
	  SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT);

	  assert((VT == MVT::i32) || (VT == MVT::i64));

	  // Expand
	  //  (set dst, (i64 (load baseptr)))
	  // to
	  //  (set tmp, (ldl (add baseptr, 7), undef))
	  //  (set dst, (ldr baseptr, tmp))
	  // (offsets shown for little-endian; big-endian swaps the 0 and 7).
	  if ((VT == MVT::i64) && (ExtType == ISD::NON_EXTLOAD)) {
	    SDValue LDL = createLoadLR(MipsISD::LDL, DAG, LD, Chain, Undef,
	                               IsLittle ? 7 : 0);
	    return createLoadLR(MipsISD::LDR, DAG, LD, LDL.getValue(1), LDL,
	                        IsLittle ? 0 : 7);
	  }

	  SDValue LWL = createLoadLR(MipsISD::LWL, DAG, LD, Chain, Undef,
	                             IsLittle ? 3 : 0);
	  SDValue LWR = createLoadLR(MipsISD::LWR, DAG, LD, LWL.getValue(1), LWL,
	                             IsLittle ? 0 : 3);

	  // Expand
	  //  (set dst, (i32 (load baseptr))) or
	  //  (set dst, (i64 (sextload baseptr))) or
	  //  (set dst, (i64 (extload baseptr)))
	  // to
	  //  (set tmp, (lwl (add baseptr, 3), undef))
	  //  (set dst, (lwr baseptr, tmp))
	  if ((VT == MVT::i32) || (ExtType == ISD::SEXTLOAD) ||
	      (ExtType == ISD::EXTLOAD))
	    return LWR;

	  assert((VT == MVT::i64) && (ExtType == ISD::ZEXTLOAD));

	  // Expand
	  //  (set dst, (i64 (zextload baseptr)))
	  // to
	  //  (set tmp0, (lwl (add baseptr, 3), undef))
	  //  (set tmp1, (lwr baseptr, tmp0))
	  //  (set tmp2, (shl tmp1, 32))
	  //  (set dst, (srl tmp2, 32))
	  SDLoc DL(LD);
	  SDValue Const32 = DAG.getConstant(32, DL, MVT::i32);
	  SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
	  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
	  // Return the zero-extended value together with the LWR output chain.
	  SDValue Ops[] = { SRL, LWR.getValue(1) };
	  return DAG.getMergeValues(Ops, DL);
	}

	// Build one half (Opc) of a left/right store pair for SD. The node writes
	// SD's value to SD's base pointer plus Offset and reuses SD's memory type and
	// memory operand.
	static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
	                             SDValue Chain, unsigned Offset) {
	  SDLoc Loc(SD);
	  SDValue Addr = SD->getBasePtr();
	  SDValue Stored = SD->getValue();
	  EVT AddrVT = Addr.getValueType();

	  // The node only yields an output chain.
	  SDVTList ResultTys = DAG.getVTList(MVT::Other);

	  if (Offset != 0) {
	    SDValue OffsetNode = DAG.getConstant(Offset, Loc, AddrVT);
	    Addr = DAG.getNode(ISD::ADD, Loc, AddrVT, Addr, OffsetNode);
	  }

	  SDValue Operands[] = { Chain, Stored, Addr };
	  return DAG.getMemIntrinsicNode(Opc, Loc, ResultTys, Operands,
	                                 SD->getMemoryVT(), SD->getMemOperand());
	}

	// Expand an unaligned 32 or 64-bit integer store node into an SWL/SWR or
	// SDL/SDR pair. IsLittle selects which half gets the non-zero offset.
	static SDValue lowerUnalignedIntStore(StoreSDNode *SD, SelectionDAG &DAG,
	                                      bool IsLittle) {
	  SDValue Value = SD->getValue(), Chain = SD->getChain();
	  EVT VT = Value.getValueType();

	  // Expand
	  //  (store val, baseptr) or
	  //  (truncstore val, baseptr)
	  // to
	  //  (swl val, (add baseptr, 3))
	  //  (swr val, baseptr)
	  // (offsets shown for little-endian; big-endian swaps the 0 and 3).
	  if ((VT == MVT::i32) || SD->isTruncatingStore()) {
	    SDValue SWL = createStoreLR(MipsISD::SWL, DAG, SD, Chain,
	                                IsLittle ? 3 : 0);
	    return createStoreLR(MipsISD::SWR, DAG, SD, SWL, IsLittle ? 0 : 3);
	  }

	  assert(VT == MVT::i64);

	  // Expand
	  //  (store val, baseptr)
	  // to
	  //  (sdl val, (add baseptr, 7))
	  //  (sdr val, baseptr)
	  SDValue SDL = createStoreLR(MipsISD::SDL, DAG, SD, Chain, IsLittle ? 7 : 0);
	  return createStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
	}

	// Rewrite (store (fp_to_sint $fp) $ptr) as (store (TruncIntFP $fp), $ptr).
	// Yields an empty SDValue when the stored value is not an FP_TO_SINT node.
	static SDValue lowerFP_TO_SINT_STORE(StoreSDNode *SD, SelectionDAG &DAG) {
	  SDValue Stored = SD->getValue();

	  const bool IsFPToSInt = Stored.getOpcode() == ISD::FP_TO_SINT;
	  if (!IsFPToSInt)
	    return SDValue();

	  // The truncation result is a floating point type with the same bit width
	  // as the integer value being stored.
	  EVT TruncTy = EVT::getFloatingPointVT(Stored.getValueSizeInBits());
	  SDValue FPSource = Stored.getOperand(0);
	  SDValue Truncated =
	      DAG.getNode(MipsISD::TruncIntFP, SDLoc(Stored), TruncTy, FPSource);

	  // Re-emit the store with the original chain, address, pointer info,
	  // alignment and memory operand flags.
	  return DAG.getStore(SD->getChain(), SDLoc(SD), Truncated, SD->getBasePtr(),
	                      SD->getPointerInfo(), SD->getAlignment(),
	                      SD->getMemOperand()->getFlags());
	}

	// Lower a store node: unaligned i32/i64 stores are expanded when the
	// subtarget lacks unaligned access support; every other store is offered to
	// the fp_to_sint store rewrite.
	SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
	  StoreSDNode *SD = cast<StoreSDNode>(Op);
	  EVT MemVT = SD->getMemoryVT();

	  // Lower unaligned integer stores.
	  if (!Subtarget.systemSupportsUnalignedAccess() &&
	      (SD->getAlignment() < MemVT.getSizeInBits() / 8) &&
	      ((MemVT == MVT::i32) || (MemVT == MVT::i64)))
	    return lowerUnalignedIntStore(SD, DAG, Subtarget.isLittle());

	  return lowerFP_TO_SINT_STORE(SD, DAG);
	}

	SDValue MipsTargetLowering::lowerEH_DWARF_CFA(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // The result is the frame index of a fixed stack object at offset 0, which
	  // points to the old stack pointer. The object is as large as the result
	  // value and is created as not immutable.
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  EVT ResultTy = Op->getValueType(0);
	  const auto SizeInBytes = Op.getValueSizeInBits() / 8;
	  const int FixedFI = FrameInfo.CreateFixedObject(SizeInBytes, 0, false);
	  return DAG.getFrameIndex(FixedFI, ResultTy);
	}

	SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  // Truncate into a floating point type with the same bit width as the
	  // integer result, then bitcast to the integer type.
	  EVT TruncTy = EVT::getFloatingPointVT(Op.getValueSizeInBits());
	  SDValue FPSource = Op.getOperand(0);
	  SDValue Truncated =
	      DAG.getNode(MipsISD::TruncIntFP, SDLoc(Op), TruncTy, FPSource);
	  return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Truncated);
	}

	//===----------------------------------------------------------------------===//
	// Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// TODO: Implement a generic logic using tblgen that can support this.
	// Mips O32 ABI rules:
	// ---
	// i32 - Passed in A0, A1, A2, A3 and stack
	// f32 - Only passed in f32 registers if no int reg has been used yet to hold
	// an argument. Otherwise, passed in A1, A2, A3 and stack.
	// f64 - Only passed in two aliased f32 registers if no int reg has been used
	// yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
	// not used, it must be shadowed. If only A3 is available, shadow it and
	// go to stack.
	// vXiX - Received as scalarized i32s, passed in A0 - A3 and the stack.
	// vXf32 - Passed in either a pair of registers {A0, A1}, {A2, A3} or {A0 - A3}
	// with the remainder spilled to the stack.
	// vXf64 - Passed in either {A0, A1, A2, A3} or {A2, A3} and in both cases
	// spilling the remainder to the stack.
	//
	// For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
	//===----------------------------------------------------------------------===//

	static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
	CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
	CCState &State, ArrayRef<MCPhysReg> F64Regs) {
	const MipsSubtarget &Subtarget = static_cast<const MipsSubtarget &>(
	State.getMachineFunction().getSubtarget());

	static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };

	const MipsCCState * MipsState = static_cast<MipsCCState *>(&State);

	static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };

	static const MCPhysReg FloatVectorIntRegs[] = { Mips::A0, Mips::A2 };

	// Do not process byval args here.
	if (ArgFlags.isByVal())
	return true;

	// Promote i8 and i16
	if (ArgFlags.isInReg() && !Subtarget.isLittle()) {
	if (LocVT == MVT::i8 \|\| LocVT == MVT::i16 \|\| LocVT == MVT::i32) {
	LocVT = MVT::i32;
	if (ArgFlags.isSExt())
	LocInfo = CCValAssign::SExtUpper;
	else if (ArgFlags.isZExt())
	LocInfo = CCValAssign::ZExtUpper;
	else
	LocInfo = CCValAssign::AExtUpper;
	}
	}

	// Promote i8 and i16
	if (LocVT == MVT::i8 \|\| LocVT == MVT::i16) {
	LocVT = MVT::i32;
	if (ArgFlags.isSExt())
	LocInfo = CCValAssign::SExt;
	else if (ArgFlags.isZExt())
	LocInfo = CCValAssign::ZExt;
	else
	LocInfo = CCValAssign::AExt;
	}

	unsigned Reg;

	// f32 and f64 are allocated in A0, A1, A2, A3 when either of the following
	// is true: function is vararg, argument is 3rd or higher, there is previous
	// argument which is not f32 or f64.
	bool AllocateFloatsInIntReg = State.isVarArg() \|\| ValNo > 1 \|\|
	State.getFirstUnallocated(F32Regs) != ValNo;
	unsigned OrigAlign = ArgFlags.getOrigAlign();
	bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
	bool isVectorFloat = MipsState->WasOriginalArgVectorFloat(ValNo);

	// The MIPS vector ABI for floats passes them in a pair of registers
	if (ValVT == MVT::i32 && isVectorFloat) {
	// This is the start of an vector that was scalarized into an unknown number
	// of components. It doesn't matter how many there are. Allocate one of the
	// notional 8 byte aligned registers which map onto the argument stack, and
	// shadow the register lost to alignment requirements.
	if (ArgFlags.isSplit()) {
	Reg = State.AllocateReg(FloatVectorIntRegs);
	if (Reg == Mips::A2)
	State.AllocateReg(Mips::A1);
	else if (Reg == 0)
	State.AllocateReg(Mips::A3);
	} else {
	// If we're an intermediate component of the split, we can just attempt to
	// allocate a register directly.
	Reg = State.AllocateReg(IntRegs);
	}
	} else if (ValVT == MVT::i32 \|\| (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
	Reg = State.AllocateReg(IntRegs);
	// If this is the first part of an i64 arg,
	// the allocated register must be either A0 or A2.
	if (isI64 && (Reg == Mips::A1 \|\| Reg == Mips::A3))
	Reg = State.AllocateReg(IntRegs);
	LocVT = MVT::i32;
	} else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
	// Allocate int register and shadow next int register. If first
	// available register is Mips::A1 or Mips::A3, shadow it too.
	Reg = State.AllocateReg(IntRegs);
	if (Reg == Mips::A1 \|\| Reg == Mips::A3)
	Reg = State.AllocateReg(IntRegs);
	State.AllocateReg(IntRegs);
	LocVT = MVT::i32;
	} else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
	// we are guaranteed to find an available float register
	if (ValVT == MVT::f32) {
	Reg = State.AllocateReg(F32Regs);
	// Shadow int register
	State.AllocateReg(IntRegs);
	} else {
	Reg = State.AllocateReg(F64Regs);
	// Shadow int registers
	unsigned Reg2 = State.AllocateReg(IntRegs);
	if (Reg2 == Mips::A1 \|\| Reg2 == Mips::A3)
	State.AllocateReg(IntRegs);
	State.AllocateReg(IntRegs);
	}
	} else
	llvm_unreachable("Cannot handle this ValVT.");

	if (!Reg) {
	unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), OrigAlign);
	State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
	} else
	State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));

	return false;
	}

	static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
	MVT LocVT, CCValAssign::LocInfo LocInfo,
	ISD::ArgFlagsTy ArgFlags, CCState &State) {
	static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 };

	return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
	}

	static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
	MVT LocVT, CCValAssign::LocInfo LocInfo,
	ISD::ArgFlagsTy ArgFlags, CCState &State) {
	static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 };

	return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
	}

	// Forward declaration of the six-argument CC_MipsO32 overload (the variant
	// without an explicit f64 register list). It carries LLVM_ATTRIBUTE_UNUSED.
	// NOTE(review): presumably needed by the TableGen-generated
	// MipsGenCallingConv.inc included right below, and marked unused so that no
	// unused-function warning fires when nothing references it -- confirm.
	static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
	CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
	CCState &State) LLVM_ATTRIBUTE_UNUSED;

	#include "MipsGenCallingConv.inc"

	//===----------------------------------------------------------------------===//
	// Call Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// Returns the O32 integer argument register that follows \p Reg.
	///
	/// \p Reg must be A0 or A2 (asserted); A0 maps to A1 and A2 maps to A3.
	static unsigned getNextIntArgReg(unsigned Reg) {
	  assert((Reg == Mips::A0) || (Reg == Mips::A2));
	  if (Reg == Mips::A0)
	    return Mips::A1;
	  return Mips::A3;
	}

	/// Emits the store that places an outgoing argument into its stack slot.
	///
	/// \param StackPtr   value of the stack pointer used as base for normal calls.
	/// \param Offset     byte offset of the argument slot.
	/// \param Chain      chain the store is attached to.
	/// \param Arg        value to store.
	/// \param IsTailCall selects between the two addressing schemes below.
	/// \returns the chain result of the created store node.
	SDValue MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
	                                           SDValue Chain, SDValue Arg,
	                                           const SDLoc &DL, bool IsTailCall,
	                                           SelectionDAG &DAG) const {
	  const auto PtrVT = getPointerTy(DAG.getDataLayout());

	  if (IsTailCall) {
	    // Tail call: the slot is modelled as a fixed frame object at Offset and
	    // the store is flagged volatile.
	    MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	    const int SlotFI =
	        FrameInfo.CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
	    SDValue SlotAddr = DAG.getFrameIndex(SlotFI, PtrVT);
	    return DAG.getStore(Chain, DL, Arg, SlotAddr, MachinePointerInfo(),
	                        /* Alignment = */ 0, MachineMemOperand::MOVolatile);
	  }

	  // Regular call: address the slot as StackPtr + Offset.
	  SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
	                                 DAG.getIntPtrConstant(Offset, DL));
	  return DAG.getStore(Chain, DL, Arg, SlotAddr, MachinePointerInfo());
	}

	/// Appends to \p Ops the operands a call node needs after the ones the caller
	/// has already pushed: one register operand per entry of \p RegsToPass, the
	/// call-preserved register mask, and (if any copies were emitted) the glue
	/// value tying the copy-to-reg nodes together.
	///
	/// \p RegsToPass may gain an extra entry for GP (see below). \p Chain is taken
	/// by value; the copy-to-reg nodes are threaded through this local copy.
	void MipsTargetLowering::
	getOpndList(SmallVectorImpl<SDValue> &Ops,
	            std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
	            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
	            bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
	            SDValue Chain) const {
	  SelectionDAG &CurDAG = CLI.DAG;

	  // Insert node "GP copy globalreg" before call to function.
	  //
	  // R_MIPS_CALL* operators (emitted when non-internal functions are called
	  // in PIC mode) allow symbols to be resolved via lazy binding, and the lazy
	  // binding stub requires GP to point to the GOT.
	  // Indirect calls (where R_MIPS_CALL* is not used for the call) do not need
	  // this: the Mips linker generates a lazy binding stub for a function only
	  // when R_MIPS_CALL* are the only relocs used for it (i.e. no stub is made
	  // for a function whose address is taken in the program).
	  const bool NeedsGP = IsPICCall && !InternalLinkage && IsCallReloc;
	  if (NeedsGP) {
	    const bool IsN64 = ABI.IsN64();
	    unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP;
	    EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
	    RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CurDAG, Ty)));
	  }

	  // Build a sequence of copy-to-reg nodes chained together with token
	  // chain and flag operands which copy the outgoing args into registers.
	  // The InFlag is necessary since all emitted instructions must be
	  // stuck together.
	  SDValue InFlag;
	  for (const std::pair<unsigned, SDValue> &RegAndVal : RegsToPass) {
	    Chain = CurDAG.getCopyToReg(Chain, CLI.DL, RegAndVal.first,
	                                RegAndVal.second, InFlag);
	    InFlag = Chain.getValue(1);
	  }

	  // Add argument registers to the end of the list so that they are
	  // known live into the call.
	  for (const std::pair<unsigned, SDValue> &RegAndVal : RegsToPass)
	    Ops.push_back(CurDAG.getRegister(RegAndVal.first,
	                                     RegAndVal.second.getValueType()));

	  // Add a register mask operand representing the call-preserved registers.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *Mask =
	      TRI->getCallPreservedMask(CurDAG.getMachineFunction(), CLI.CallConv);
	  assert(Mask && "Missing call preserved mask for calling convention");

	  // In Mips16 hard-float mode, a direct call to a function carrying the
	  // "__Mips16RetHelper" attribute uses a dedicated mask instead.
	  if (Subtarget.inMips16HardFloat()) {
	    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
	      const GlobalValue *GV = G->getGlobal();
	      StringRef Sym = GV->getName();
	      Function *F = GV->getParent()->getFunction(Sym);
	      if (F && F->hasFnAttribute("__Mips16RetHelper"))
	        Mask = MipsRegisterInfo::getMips16RetHelperMask();
	    }
	  }
	  Ops.push_back(CurDAG.getRegisterMask(Mask));

	  // The glue is only present if at least one copy-to-reg was emitted.
	  if (InFlag.getNode())
	    Ops.push_back(InFlag);
	}

	/// LowerCall - function arguments are copied from virtual regs to
	/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
	///
	/// Outline of the steps performed below:
	///  1. Assign a location to every outgoing operand with CC_Mips.
	///  2. Decide whether a requested tail call can really be performed; this
	///     writes back into CLI.IsTailCall through the IsTailCall reference.
	///  3. Materialize each argument into RegsToPass (register locations) or
	///     into a stack store (memory locations); byval arguments go through
	///     passByValArg.
	///  4. Rewrite the callee into its target-specific address form.
	///  5. Emit either a MipsISD::TailCall node, or a MipsISD::JmpLink node
	///     followed by CALLSEQ_END and the result copies (LowerCallResult).
	///
	/// \param CLI    call description; CLI.IsTailCall may be cleared here.
	/// \param InVals receives the values returned by the call (filled in by
	///               LowerCallResult on the non-tail-call path).
	/// \returns the tail-call node, or the chain returned by LowerCallResult.
	SDValue
	MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc DL = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	// Reference, not a copy: clearing it below is visible to the caller via CLI.
	bool &IsTailCall = CLI.IsTailCall;
	CallingConv::ID CallConv = CLI.CallConv;
	bool IsVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
	MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
	bool IsPIC = isPositionIndependent();

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	MipsCCState CCInfo(
	CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(),
	MipsCCState::getSpecialCallingConvForCallee(Callee.getNode(), Subtarget));

	// Allocate the reserved argument area. It seems strange to do this from the
	// caller side but removing it breaks the frame size calculation.
	CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);

	// The callee's symbol name (if it is an external symbol) is handed to the
	// operand analysis; nullptr is passed otherwise.
	const ExternalSymbolSDNode *ES =
	dyn_cast_or_null<const ExternalSymbolSDNode>(Callee.getNode());
	CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(),
	ES ? ES->getSymbol() : nullptr);

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NextStackOffset = CCInfo.getNextStackOffset();

	// Check if it's really possible to do a tail call. Restrict it to functions
	// that are part of this compilation unit.
	bool InternalLinkage = false;
	if (IsTailCall) {
	IsTailCall = isEligibleForTailCallOptimization(
	CCInfo, NextStackOffset, *MF.getInfo<MipsFunctionInfo>());
	if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	InternalLinkage = G->getGlobal()->hasInternalLinkage();
	IsTailCall &= (InternalLinkage \|\| G->getGlobal()->hasLocalLinkage() \|\|
	G->getGlobal()->hasPrivateLinkage() \|\|
	G->getGlobal()->hasHiddenVisibility() \|\|
	G->getGlobal()->hasProtectedVisibility());
	}
	}
	// A musttail call site that could not be turned into a tail call is a hard
	// error.
	if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
	report_fatal_error("failed to perform tail call elimination on a call "
	"site marked musttail");

	if (IsTailCall)
	++NumTailCalls;

	// Chain is the output chain of the last Load/Store or CopyToReg node.
	// NOTE(review): the original comment also described a "ByValChain" for the
	// memcpy nodes of byval arguments, but no such variable exists in this
	// function; that sentence appears to be stale.
	unsigned StackAlignment = TFL->getStackAlignment();
	NextStackOffset = alignTo(NextStackOffset, StackAlignment);
	SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);

	// Tail calls do not get a CALLSEQ_START/CALLSEQ_END pair.
	if (!IsTailCall)
	Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL);

	SDValue StackPtr =
	DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
	getPointerTy(DAG.getDataLayout()));

	std::deque<std::pair<unsigned, SDValue>> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;

	// Restart the byval in-register cursor before walking the locations; it is
	// advanced with nextInRegsParam() for each byval argument below.
	CCInfo.rewindByValRegsInfo();

	// Walk the register/memloc assignments, inserting copies/loads.
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	SDValue Arg = OutVals[i];
	CCValAssign &VA = ArgLocs[i];
	MVT ValVT = VA.getValVT(), LocVT = VA.getLocVT();
	ISD::ArgFlagsTy Flags = Outs[i].Flags;
	bool UseUpperBits = false;

	// ByVal Arg.
	if (Flags.isByVal()) {
	unsigned FirstByValReg, LastByValReg;
	unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
	CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);

	assert(Flags.getByValSize() &&
	"ByVal args of size 0 should have been ignored by front-end.");
	assert(ByValIdx < CCInfo.getInRegsParamsCount());
	assert(!IsTailCall &&
	"Do not tail-call optimize if there is a byval argument.");
	passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
	FirstByValReg, LastByValReg, Flags, Subtarget.isLittle(),
	VA);
	CCInfo.nextInRegsParam();
	continue;
	}

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default:
	llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full:
	if (VA.isRegLoc()) {
	// Same-width float<->int register assignments are plain bitcasts.
	if ((ValVT == MVT::f32 && LocVT == MVT::i32) \|\|
	(ValVT == MVT::f64 && LocVT == MVT::i64) \|\|
	(ValVT == MVT::i64 && LocVT == MVT::f64))
	Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
	else if (ValVT == MVT::f64 && LocVT == MVT::i32) {
	// An f64 assigned to i32 registers is split into two i32 halves that
	// are passed in the located register and the one following it; the
	// halves are swapped on big-endian targets.
	SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
	Arg, DAG.getConstant(0, DL, MVT::i32));
	SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
	Arg, DAG.getConstant(1, DL, MVT::i32));
	if (!Subtarget.isLittle())
	std::swap(Lo, Hi);
	unsigned LocRegLo = VA.getLocReg();
	unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
	RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
	RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
	continue;
	}
	}
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
	break;
	// Each *Upper kind extends exactly like its plain counterpart and
	// additionally requests the left shift performed after the switch.
	case CCValAssign::SExtUpper:
	UseUpperBits = true;
	LLVM_FALLTHROUGH;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, LocVT, Arg);
	break;
	case CCValAssign::ZExtUpper:
	UseUpperBits = true;
	LLVM_FALLTHROUGH;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, LocVT, Arg);
	break;
	case CCValAssign::AExtUpper:
	UseUpperBits = true;
	LLVM_FALLTHROUGH;
	case CCValAssign::AExt:
	Arg = DAG.getNode(ISD::ANY_EXTEND, DL, LocVT, Arg);
	break;
	}

	// Move the value into the upper bits of the location by shifting left by
	// the difference between the location width and the argument width.
	if (UseUpperBits) {
	unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
	unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
	Arg = DAG.getNode(
	ISD::SHL, DL, VA.getLocVT(), Arg,
	DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
	}

	// Arguments that can be passed in a register must be kept in the
	// RegsToPass vector
	if (VA.isRegLoc()) {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	continue;
	}

	// Register can't get to this point...
	assert(VA.isMemLoc());

	// emit ISD::STORE which stores the
	// parameter value to a stack Location
	MemOpChains.push_back(passArgOnStack(StackPtr, VA.getLocMemOffset(),
	Chain, Arg, DL, IsTailCall, DAG));
	}

	// Transform all store nodes into one single node because all store
	// nodes are independent of each other.
	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
	// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
	// node so that legalize doesn't hack it.

	EVT Ty = Callee.getValueType();
	bool GlobalOrExternal = false, IsCallReloc = false;

	// The long-calls feature is ignored in case of PIC.
	// While we do not support -mshared / -mno-shared properly,
	// ignore long-calls in case of -mabicalls too.
	if (!Subtarget.isABICalls() && !IsPIC) {
	// If the function should be called using "long call",
	// get its address into a register to prevent using
	// of the `jal` instruction for the direct call.
	if (auto *N = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	if (Subtarget.useLongCalls())
	Callee = Subtarget.hasSym32()
	? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
	: getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
	} else if (auto *N = dyn_cast<GlobalAddressSDNode>(Callee)) {
	bool UseLongCalls = Subtarget.useLongCalls();
	// If the function has the long-call/short-call attribute
	// it overrides the command line switch passed to the backend.
	if (auto *F = dyn_cast<Function>(N->getGlobal())) {
	if (F->hasFnAttribute("long-call"))
	UseLongCalls = true;
	else if (F->hasFnAttribute("short-call"))
	UseLongCalls = false;
	}
	if (UseLongCalls)
	Callee = Subtarget.hasSym32()
	? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
	: getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
	}
	}

	// Note: if the long-call path above replaced Callee, the dyn_casts below are
	// performed on the replacement value, not on the original callee node.
	if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	if (IsPIC) {
	const GlobalValue *Val = G->getGlobal();
	InternalLinkage = Val->hasInternalLinkage();

	// PIC: internal symbols use a local address; everything else is loaded
	// through a call-specific GOT access (IsCallReloc is set for those).
	if (InternalLinkage)
	Callee = getAddrLocal(G, DL, Ty, DAG, ABI.IsN32() \|\| ABI.IsN64());
	else if (LargeGOT) {
	Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16,
	MipsII::MO_CALL_LO16, Chain,
	FuncInfo->callPtrInfo(Val));
	IsCallReloc = true;
	} else {
	Callee = getAddrGlobal(G, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
	FuncInfo->callPtrInfo(Val));
	IsCallReloc = true;
	}
	} else
	Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL,
	getPointerTy(DAG.getDataLayout()), 0,
	MipsII::MO_NO_FLAG);
	GlobalOrExternal = true;
	}
	else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	const char *Sym = S->getSymbol();

	if (!IsPIC) // static
	Callee = DAG.getTargetExternalSymbol(
	Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG);
	else if (LargeGOT) {
	Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16,
	MipsII::MO_CALL_LO16, Chain,
	FuncInfo->callPtrInfo(Sym));
	IsCallReloc = true;
	} else { // PIC
	Callee = getAddrGlobal(S, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
	FuncInfo->callPtrInfo(Sym));
	IsCallReloc = true;
	}

	GlobalOrExternal = true;
	}

	// Operand list of the call node; it starts with the chain, and getOpndList
	// appends the remaining operands.
	SmallVector<SDValue, 8> Ops(1, Chain);
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	getOpndList(Ops, RegsToPass, IsPIC, GlobalOrExternal, InternalLinkage,
	IsCallReloc, CLI, Callee, Chain);

	// A tail call ends here: no CALLSEQ_END and no result copies are emitted.
	if (IsTailCall) {
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops);
	}

	Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops);
	SDValue InFlag = Chain.getValue(1);

	// Create the CALLSEQ_END node.
	Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
	DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
	InFlag = Chain.getValue(1);

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
	InVals, CLI);
	}

	/// LowerCallResult - copies each value returned by a call out of the physical
	/// register it was assigned to and converts it back to its value type.
	///
	/// \param Chain  chain to thread the register copies through.
	/// \param InFlag glue linking the first copy to the call.
	/// \param Ins    descriptions of the returned values.
	/// \param InVals receives one converted value per return location.
	/// \param CLI    used for the callee node and the return type.
	/// \returns the chain after the last register copy.
	SDValue MipsTargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	    TargetLowering::CallLoweringInfo &CLI) const {
	  // Assign a location to every returned value.
	  SmallVector<CCValAssign, 16> RVLocs;
	  MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
	                     *DAG.getContext());

	  // The symbol name is only available when the callee is an external symbol.
	  const ExternalSymbolSDNode *ES =
	      dyn_cast_or_null<const ExternalSymbolSDNode>(CLI.Callee.getNode());
	  const char *CalleeSym = nullptr;
	  if (ES)
	    CalleeSym = ES->getSymbol();
	  CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy, CalleeSym);

	  // Shared tail of every promoted case: narrow a location-typed value down to
	  // the given value type.
	  auto Truncate = [&DAG, &DL](MVT ToVT, SDValue V) {
	    return DAG.getNode(ISD::TRUNCATE, DL, ToVT, V);
	  };

	  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
	    CCValAssign &VA = RVLocs[Idx];
	    assert(VA.isRegLoc() && "Can only return in registers!");

	    const MVT LocVT = VA.getLocVT();
	    const MVT ValVT = VA.getValVT();
	    const CCValAssign::LocInfo Info = VA.getLocInfo();

	    // Copy the raw register contents; results 1 and 2 of the copy are the
	    // updated chain and glue.
	    SDValue Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), LocVT, InFlag);
	    Chain = Val.getValue(1);
	    InFlag = Val.getValue(2);

	    // A value held in the upper bits is shifted right by the width
	    // difference: logically for ZExtUpper, arithmetically otherwise.
	    if (VA.isUpperBitsInLoc()) {
	      const unsigned ValSizeInBits = Ins[Idx].ArgVT.getSizeInBits();
	      const unsigned LocSizeInBits = LocVT.getSizeInBits();
	      const unsigned ShiftOpc =
	          Info == CCValAssign::ZExtUpper ? ISD::SRL : ISD::SRA;
	      SDValue ShiftAmt =
	          DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, LocVT);
	      Val = DAG.getNode(ShiftOpc, DL, LocVT, Val, ShiftAmt);
	    }

	    switch (Info) {
	    default:
	      llvm_unreachable("Unknown loc info!");
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::BCvt:
	      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
	      break;
	    case CCValAssign::AExt:
	    case CCValAssign::AExtUpper:
	      Val = Truncate(ValVT, Val);
	      break;
	    case CCValAssign::ZExt:
	    case CCValAssign::ZExtUpper:
	      Val = DAG.getNode(ISD::AssertZext, DL, LocVT, Val,
	                        DAG.getValueType(ValVT));
	      Val = Truncate(ValVT, Val);
	      break;
	    case CCValAssign::SExt:
	    case CCValAssign::SExtUpper:
	      Val = DAG.getNode(ISD::AssertSext, DL, LocVT, Val,
	                        DAG.getValueType(ValVT));
	      Val = Truncate(ValVT, Val);
	      break;
	    }

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	/// Recovers a value of type VA.getValVT() from \p Val, the raw contents of an
	/// argument slot of type VA.getLocVT(), as directed by VA.getLocInfo().
	///
	/// \param Val   slot contents as loaded/copied by the caller.
	/// \param VA    location assignment describing how the value was promoted.
	/// \param ArgVT type whose bit width determines the shift amount used for
	///              the *Upper location kinds.
	/// \returns the unpacked value.
	static SDValue UnpackFromArgumentSlot(SDValue Val, const CCValAssign &VA,
	                                      EVT ArgVT, const SDLoc &DL,
	                                      SelectionDAG &DAG) {
	  const MVT SlotVT = VA.getLocVT();
	  const EVT ResultVT = VA.getValVT();
	  const CCValAssign::LocInfo Info = VA.getLocInfo();

	  // A value kept in the upper bits of the slot is first shifted right by the
	  // width difference: logically for ZExtUpper, arithmetically for the other
	  // two *Upper kinds.
	  const bool InUpperBits = Info == CCValAssign::AExtUpper ||
	                           Info == CCValAssign::SExtUpper ||
	                           Info == CCValAssign::ZExtUpper;
	  if (InUpperBits) {
	    const unsigned ValSizeInBits = ArgVT.getSizeInBits();
	    const unsigned LocSizeInBits = SlotVT.getSizeInBits();
	    const unsigned ShiftOpc =
	        Info == CCValAssign::ZExtUpper ? ISD::SRL : ISD::SRA;
	    SDValue ShiftAmt =
	        DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, SlotVT);
	    Val = DAG.getNode(ShiftOpc, DL, SlotVT, Val, ShiftAmt);
	  }

	  // If this is a value smaller than the argument slot size (32-bit for O32,
	  // 64-bit for N32/N64), it has been promoted in some way to the argument slot
	  // size. Extract the value and insert any appropriate assertions regarding
	  // sign/zero extension.
	  switch (Info) {
	  default:
	    llvm_unreachable("Unknown loc info!");
	  case CCValAssign::Full:
	    break;
	  case CCValAssign::BCvt:
	    Val = DAG.getNode(ISD::BITCAST, DL, ResultVT, Val);
	    break;
	  case CCValAssign::AExt:
	  case CCValAssign::AExtUpper:
	    Val = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Val);
	    break;
	  case CCValAssign::SExt:
	  case CCValAssign::SExtUpper:
	    Val = DAG.getNode(ISD::AssertSext, DL, SlotVT, Val,
	                      DAG.getValueType(ResultVT));
	    Val = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Val);
	    break;
	  case CCValAssign::ZExt:
	  case CCValAssign::ZExtUpper:
	    Val = DAG.getNode(ISD::AssertZext, DL, SlotVT, Val,
	                      DAG.getValueType(ResultVT));
	    Val = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Val);
	    break;
	  }

	  return Val;
	}

	//===----------------------------------------------------------------------===//
	// Formal Arguments Calling Convention Implementation
	//===----------------------------------------------------------------------===//
	/// LowerFormalArguments - transform physical registers into virtual registers
	/// and generate load operations for arguments placed on the stack.
	///
	/// \param Chain  incoming chain.
	/// \param Ins    descriptions of the incoming (already legalized) arguments.
	/// \param InVals receives the value produced for each incoming argument.
	/// \returns the resulting chain; when any loads/stores were collected in
	///          OutChains it is a TokenFactor over them and the previous chain.
	///
	/// Reports a fatal error if the function has the "interrupt" attribute and
	/// takes arguments.
	SDValue MipsTargetLowering::LowerFormalArguments(
	SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();

	MipsFI->setVarArgsFrameIndex(0);

	// Collects the chains of the argument loads created below and is also
	// handed to copyByValRegs / writeVarArgRegs.
	std::vector<SDValue> OutChains;

	// Assign locations to all of the incoming arguments.
	SmallVector<CCValAssign, 16> ArgLocs;
	MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
	*DAG.getContext());
	CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
	const Function &Func = DAG.getMachineFunction().getFunction();
	Function::const_arg_iterator FuncArg = Func.arg_begin();

	if (Func.hasFnAttribute("interrupt") && !Func.arg_empty())
	report_fatal_error(
	"Functions with the interrupt attribute cannot have arguments!");

	CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FixedArg);
	MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
	CCInfo.getInRegsParamsCount() > 0);

	// CurArgIdx tracks the IR argument index FuncArg currently points at, so the
	// iterator can be advanced by the difference for each original argument.
	unsigned CurArgIdx = 0;
	CCInfo.rewindByValRegsInfo();

	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	if (Ins[i].isOrigArg()) {
	std::advance(FuncArg, Ins[i].getOrigArgIndex() - CurArgIdx);
	CurArgIdx = Ins[i].getOrigArgIndex();
	}
	EVT ValVT = VA.getValVT();
	ISD::ArgFlagsTy Flags = Ins[i].Flags;
	bool IsRegLoc = VA.isRegLoc();

	// Byval arguments are handled entirely by copyByValRegs.
	if (Flags.isByVal()) {
	assert(Ins[i].isOrigArg() && "Byval arguments cannot be implicit");
	unsigned FirstByValReg, LastByValReg;
	unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
	CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);

	assert(Flags.getByValSize() &&
	"ByVal args of size 0 should have been ignored by front-end.");
	assert(ByValIdx < CCInfo.getInRegsParamsCount());
	copyByValRegs(Chain, DL, OutChains, DAG, Flags, InVals, &*FuncArg,
	FirstByValReg, LastByValReg, VA, CCInfo);
	CCInfo.nextInRegsParam();
	continue;
	}

	// Arguments stored in registers
	if (IsRegLoc) {
	MVT RegVT = VA.getLocVT();
	unsigned ArgReg = VA.getLocReg();
	const TargetRegisterClass *RC = getRegClassFor(RegVT);

	// Transform the arguments stored in
	// physical registers into virtual ones
	unsigned Reg = addLiveIn(DAG.getMachineFunction(), ArgReg, RC);
	SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

	ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);

	// Handle floating point arguments passed in integer registers and
	// long double arguments passed in floating point registers.
	if ((RegVT == MVT::i32 && ValVT == MVT::f32) \|\|
	(RegVT == MVT::i64 && ValVT == MVT::f64) \|\|
	(RegVT == MVT::f64 && ValVT == MVT::i64))
	ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
	else if (ABI.IsO32() && RegVT == MVT::i32 &&
	ValVT == MVT::f64) {
	// O32 f64 in integer registers: read the following register as well and
	// rebuild the f64 from the pair (halves swapped on big-endian targets).
	unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
	getNextIntArgReg(ArgReg), RC);
	SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
	if (!Subtarget.isLittle())
	std::swap(ArgValue, ArgValue2);
	ArgValue = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64,
	ArgValue, ArgValue2);
	}

	InVals.push_back(ArgValue);
	} else { // !VA.isRegLoc(): the argument lives on the stack
	MVT LocVT = VA.getLocVT();

	if (ABI.IsO32()) {
	// We ought to be able to use LocVT directly but O32 sets it to i32
	// when allocating floating point values to integer registers.
	// This shouldn't influence how we load the value into registers unless
	// we are targeting softfloat.
	if (VA.getValVT().isFloatingPoint() && !Subtarget.useSoftFloat())
	LocVT = VA.getValVT();
	}

	// sanity check
	assert(VA.isMemLoc());

	// The stack pointer offset is relative to the caller stack frame.
	int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
	VA.getLocMemOffset(), true);

	// Create load nodes to retrieve arguments from the stack
	SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	SDValue ArgValue = DAG.getLoad(
	LocVT, DL, Chain, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	OutChains.push_back(ArgValue.getValue(1));

	ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);

	InVals.push_back(ArgValue);
	}
	}

	// Only the first sret argument found is processed (note the break).
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	// The mips ABIs for returning structs by value requires that we copy
	// the sret argument into $v0 for the return. Save the argument into
	// a virtual register so that we can access it from the return points.
	if (Ins[i].Flags.isSRet()) {
	unsigned Reg = MipsFI->getSRetReturnReg();
	if (!Reg) {
	Reg = MF.getRegInfo().createVirtualRegister(
	getRegClassFor(ABI.IsN64() ? MVT::i64 : MVT::i32));
	MipsFI->setSRetReturnReg(Reg);
	}
	SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
	break;
	}
	}

	if (IsVarArg)
	writeVarArgRegs(OutChains, Chain, DL, DAG, CCInfo);

	// All collected chains are grouped into one TokenFactor node together with
	// the current chain. OutChains is non-empty whenever a stack argument load
	// was created above.
	// NOTE(review): the original comment said this only happens for vararg
	// functions, which does not match the stack-load path -- confirm.
	if (!OutChains.empty()) {
	OutChains.push_back(Chain);
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
	}

	return Chain;
	}

	//===----------------------------------------------------------------------===//
	// Return Value Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// Reports whether the return values described by \p Outs can be assigned
	/// locations under the RetCC_Mips convention, as decided by
	/// MipsCCState::CheckReturn.
	bool
	MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
	                                   MachineFunction &MF, bool IsVarArg,
	                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                   LLVMContext &Context) const {
	  SmallVector<CCValAssign, 16> ReturnLocs;
	  MipsCCState ReturnState(CallConv, IsVarArg, MF, ReturnLocs, Context);
	  const bool Lowerable = ReturnState.CheckReturn(Outs, RetCC_Mips);
	  return Lowerable;
	}

	// Returns true when Type is i32 and the selected condition holds; otherwise
	// returns IsSigned unchanged.
	// This hunk changes the condition: the removed ('-') lines tested
	// Subtarget.hasMips3() && Subtarget.useSoftFloat(), the added ('+') line
	// tests for the N32 or N64 ABI instead.
	bool
	MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
	- if (Subtarget.hasMips3() && Subtarget.useSoftFloat()) {
	- if (Type == MVT::i32)
	+ if ((ABI.IsN32() \|\| ABI.IsN64()) && Type == MVT::i32)
	return true;
	- }
	+
	return IsSigned;
	}

	/// Builds the return node for an interrupt handler: flags the current
	/// function as an ISR in its MipsFunctionInfo and emits a MipsISD::ERet node
	/// over \p RetOps.
	SDValue
	MipsTargetLowering::LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
	                                         const SDLoc &DL,
	                                         SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getInfo<MipsFunctionInfo>()->setISR();

	  SDValue ERetNode = DAG.getNode(MipsISD::ERet, DL, MVT::Other, RetOps);
	  return ERetNode;
	}

	/// LowerReturn - copies the values being returned into the registers chosen
	/// by RetCC_Mips and emits the return node.
	///
	/// \param Chain   incoming chain.
	/// \param Outs    descriptions of the returned values.
	/// \param OutVals the values to return, parallel to \p Outs.
	/// \returns a MipsISD::Ret node, or the node built by LowerInterruptReturn
	///          when the function has the "interrupt" attribute.
	SDValue
	MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	bool IsVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &DL, SelectionDAG &DAG) const {
	// CCValAssign - represent the assignment of
	// the return value to a location
	SmallVector<CCValAssign, 16> RVLocs;
	MachineFunction &MF = DAG.getMachineFunction();

	// CCState - Info about the registers and stack slot.
	MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());

	// Analyze return values.
	CCInfo.AnalyzeReturn(Outs, RetCC_Mips);

	SDValue Flag;
	// Operand 0 is a placeholder for the chain; it is overwritten with the
	// final chain once all copies have been emitted.
	SmallVector<SDValue, 4> RetOps(1, Chain);

	// Copy the result values into the output registers.
	for (unsigned i = 0; i != RVLocs.size(); ++i) {
	SDValue Val = OutVals[i];
	CCValAssign &VA = RVLocs[i];
	assert(VA.isRegLoc() && "Can only return in registers!");
	bool UseUpperBits = false;

	// Each *Upper kind extends like its plain counterpart and additionally
	// requests the left shift performed after the switch.
	switch (VA.getLocInfo()) {
	default:
	llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full:
	break;
	case CCValAssign::BCvt:
	Val = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Val);
	break;
	case CCValAssign::AExtUpper:
	UseUpperBits = true;
	LLVM_FALLTHROUGH;
	case CCValAssign::AExt:
	Val = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Val);
	break;
	case CCValAssign::ZExtUpper:
	UseUpperBits = true;
	LLVM_FALLTHROUGH;
	case CCValAssign::ZExt:
	Val = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Val);
	break;
	case CCValAssign::SExtUpper:
	UseUpperBits = true;
	LLVM_FALLTHROUGH;
	case CCValAssign::SExt:
	Val = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Val);
	break;
	}

	// Shift left by the difference between the location width and the value
	// width so the value occupies the upper bits of the location.
	if (UseUpperBits) {
	unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
	unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
	Val = DAG.getNode(
	ISD::SHL, DL, VA.getLocVT(), Val,
	DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
	}

	Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);

	// Guarantee that all emitted copies are stuck together with flags.
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	}

	// The mips ABIs for returning structs by value requires that we copy
	// the sret argument into $v0 for the return. We saved the argument into
	// a virtual register in the entry block, so now we copy the value out
	// and into $v0.
	if (MF.getFunction().hasStructRetAttr()) {
	MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
	unsigned Reg = MipsFI->getSRetReturnReg();

	if (!Reg)
	llvm_unreachable("sret virtual register not created in the entry block");
	SDValue Val =
	DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(DAG.getDataLayout()));
	unsigned V0 = ABI.IsN64() ? Mips::V0_64 : Mips::V0;

	Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(V0, getPointerTy(DAG.getDataLayout())));
	}

	RetOps[0] = Chain; // Update chain.

	// Add the flag if we have it.
	if (Flag.getNode())
	RetOps.push_back(Flag);

	// ISRs must use "eret".
	if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt"))
	return LowerInterruptReturn(RetOps, DL, DAG);

	// Standard return on Mips is a "jr $ra"
	return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps);
	}

	//===----------------------------------------------------------------------===//
	// Mips Inline Assembly Support
	//===----------------------------------------------------------------------===//

	/// getConstraintType - classifies an inline-asm constraint string for this
	/// target. Constraints not recognised here are delegated to
	/// TargetLowering::getConstraintType.
	MipsTargetLowering::ConstraintType
	MipsTargetLowering::getConstraintType(StringRef Constraint) const {
	  // Mips specific constraints (see GCC config/mips/constraints.md):
	  //
	  // 'd' : An address register. Equivalent to r
	  //       unless generating MIPS16 code.
	  // 'y' : Equivalent to r; retained for
	  //       backwards compatibility.
	  // 'f' : Handled as a register class as well.
	  // 'c' : A register suitable for use in an indirect
	  //       jump. This will always be $25 for -mabicalls.
	  // 'l' : The lo register. 1 word storage.
	  // 'x' : The hilo register pair. Double word storage.
	  // 'R' and "ZC" : Memory constraints.
	  if (Constraint == "ZC")
	    return C_Memory;

	  if (Constraint.size() == 1) {
	    const char Letter = Constraint[0];
	    switch (Letter) {
	    case 'R':
	      return C_Memory;
	    case 'd':
	    case 'y':
	    case 'f':
	    case 'c':
	    case 'l':
	    case 'x':
	      return C_RegisterClass;
	    default:
	      break;
	    }
	  }

	  // Anything else is left to the target-independent implementation.
	  return TargetLowering::getConstraintType(Constraint);
	}

	/// Computes how well the operand described by \p info matches the single
	/// constraint whose first character is *\p constraint.
	/// \p info must already carry the operand and the selected alternative.
	///
	/// \returns CW_Default when there is no operand value; CW_Invalid when the
	///          constraint letter is handled here but the operand does not fit
	///          it; otherwise the weight chosen below. Letters not handled here
	///          are delegated to TargetLowering.
	TargetLowering::ConstraintWeight
	MipsTargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  Value *Operand = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!Operand)
	    return CW_Default;

	  Type *OperandTy = Operand->getType();

	  switch (*constraint) {
	  default:
	    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);

	  case 'd':
	  case 'y':
	    return OperandTy->isIntegerTy() ? CW_Register : CW_Invalid;

	  case 'f': { // FPU or MSA register
	    // The vector cast is only evaluated once isVectorTy() has succeeded.
	    const bool IsMSAVector =
	        Subtarget.hasMSA() && OperandTy->isVectorTy() &&
	        cast<VectorType>(OperandTy)->getBitWidth() == 128;
	    return (IsMSAVector || OperandTy->isFloatTy()) ? CW_Register
	                                                   : CW_Invalid;
	  }

	  case 'c': // $25 for indirect jumps
	  case 'l': // lo register
	  case 'x': // hilo register pair
	    return OperandTy->isIntegerTy() ? CW_SpecificReg : CW_Invalid;

	  case 'I': // signed 16 bit immediate
	  case 'J': // integer zero
	  case 'K': // unsigned 16 bit immediate
	  case 'L': // signed 32 bit immediate where lower 16 bits are 0
	  case 'N': // immediate in the range of -65535 to -1 (inclusive)
	  case 'O': // signed 15 bit immediate (+- 16383)
	  case 'P': // immediate in the range of 65535 to 1 (inclusive)
	    return isa<ConstantInt>(Operand) ? CW_Constant : CW_Invalid;

	  case 'R':
	    return CW_Memory;
	  }
	}

	/// Split a physical register constraint of the form "{<prefix><number>}"
	/// into its non-numeric part (\p Prefix) and numeric part (\p Reg).
	///
	/// \param C      The constraint text, braces included.
	/// \param Prefix Receives the characters between '{' and the first decimal
	///               digit (or the closing brace when there is no digit).
	/// \param Reg    Receives the parsed number; only written when digits are
	///               present.
	/// \returns A pair of flags. The first is true when parsing succeeded; the
	///          second is true when a numeric part exists.
	static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix,
	                                              unsigned long long &Reg) {
	  // An empty string has no front()/back() to inspect, so reject it first.
	  if (C.empty() || C.front() != '{' || C.back() != '}')
	    return std::make_pair(false, false);

	  // Search for the first numeric character. A local predicate is used
	  // instead of passing isdigit directly: isdigit is undefined for negative
	  // char values.
	  StringRef::const_iterator B = C.begin() + 1, E = C.end() - 1;
	  StringRef::const_iterator I =
	      std::find_if(B, E, [](char Ch) { return Ch >= '0' && Ch <= '9'; });

	  Prefix = StringRef(B, I - B);

	  // The second flag is set to false if no numeric characters were found.
	  if (I == E)
	    return std::make_pair(true, false);

	  // Parse the numeric characters. getAsUnsignedInteger returns true on
	  // failure, hence the negation.
	  return std::make_pair(!getAsUnsignedInteger(StringRef(I, E - I), 10, Reg),
	                        true);
	}

	/// Resolve an explicit physical register constraint ("{hi}", "{$msacsr}",
	/// "{$f4}", "{$fcc0}", "{$w3}", "{$7}", ...) to a register and its register
	/// class.
	///
	/// \param C  The constraint text, braces included.
	/// \param VT The operand value type; MVT::Other selects a default type for
	///           the register family.
	/// \returns The (register, register class) pair, or (0, nullptr) when the
	///          constraint does not name a register this function recognises.
	///          Malformed input (unknown prefix, register number out of range,
	///          odd register for the AFGR64 class) is reported the same way
	///          rather than asserted on, since the text comes from user input.
	std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
	parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
	  const TargetRegisterInfo *TRI =
	      Subtarget.getRegisterInfo();
	  const TargetRegisterClass *RC;
	  StringRef Prefix;
	  unsigned long long Reg = 0;

	  std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);

	  if (!R.first)
	    return std::make_pair(0U, nullptr);

	  if (Prefix == "hi" || Prefix == "lo") { // Parse hi/lo.
	    // No numeric characters follow "hi" or "lo".
	    if (R.second)
	      return std::make_pair(0U, nullptr);

	    RC = TRI->getRegClass(Prefix == "hi" ?
	                          Mips::HI32RegClassID : Mips::LO32RegClassID);
	    return std::make_pair(*(RC->begin()), RC);
	  } else if (Prefix.startswith("$msa")) {
	    // Parse $msa(ir|csr|access|save|modify|request|map|unmap)

	    // No numeric characters follow the name.
	    if (R.second)
	      return std::make_pair(0U, nullptr);

	    Reg = StringSwitch<unsigned long long>(Prefix)
	              .Case("$msair", Mips::MSAIR)
	              .Case("$msacsr", Mips::MSACSR)
	              .Case("$msaaccess", Mips::MSAAccess)
	              .Case("$msasave", Mips::MSASave)
	              .Case("$msamodify", Mips::MSAModify)
	              .Case("$msarequest", Mips::MSARequest)
	              .Case("$msamap", Mips::MSAMap)
	              .Case("$msaunmap", Mips::MSAUnmap)
	              .Default(0);

	    if (!Reg)
	      return std::make_pair(0U, nullptr);

	    RC = TRI->getRegClass(Mips::MSACtrlRegClassID);
	    return std::make_pair(Reg, RC);
	  }

	  // Every remaining register family is numbered.
	  if (!R.second)
	    return std::make_pair(0U, nullptr);

	  if (Prefix == "$f") { // Parse $f0-$f31.
	    // If the size of FP registers is 64-bit or Reg is an even number, select
	    // the 64-bit register class. Otherwise, select the 32-bit register class.
	    if (VT == MVT::Other)
	      VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;

	    RC = getRegClassFor(VT);

	    if (RC == &Mips::AFGR64RegClass) {
	      // AFGR64 is indexed by Reg / 2, so only even numbers are accepted.
	      if (Reg % 2 != 0)
	        return std::make_pair(0U, nullptr);
	      Reg >>= 1;
	    }
	  } else if (Prefix == "$fcc") { // Parse $fcc0-$fcc7.
	    RC = TRI->getRegClass(Mips::FCCRegClassID);
	  } else if (Prefix == "$w") { // Parse $w0-$w31.
	    RC = getRegClassFor((VT == MVT::Other) ? MVT::v16i8 : VT);
	  } else if (Prefix == "$") { // Parse $0-$31.
	    RC = getRegClassFor((VT == MVT::Other) ? MVT::i32 : VT);
	  } else {
	    // Unknown prefix followed by digits: not a register we can name.
	    return std::make_pair(0U, nullptr);
	  }

	  // Reject register numbers that do not exist in the selected class instead
	  // of indexing past the end of it.
	  if (!RC || Reg >= RC->getNumRegs())
	    return std::make_pair(0U, nullptr);

	  return std::make_pair(*(RC->begin() + Reg), RC);
	}

	/// Map an inline-asm register constraint to a register and register class.
	///
	/// Single-letter constraints ('d', 'y', 'r', 'f', 'c', 'l', 'x') are
	/// resolved here from the operand type \p VT: a register-class constraint
	/// returns register 0 together with the class, a specific-register
	/// constraint returns that register, and an unsupported combination returns
	/// (0, nullptr), which makes the caller report an error. Anything not
	/// resolved that way is tried as an explicit physical register via
	/// parseRegForInlineAsmConstraint, and finally handed to the generic
	/// TargetLowering implementation.
	std::pair<unsigned, const TargetRegisterClass *>
	MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	                                                 StringRef Constraint,
	                                                 MVT VT) const {
	  if (Constraint.size() == 1) {
	    switch (Constraint[0]) {
	    case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
	    case 'y': // Same as 'r'. Exists for compatibility.
	    case 'r':
	      if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
	        if (Subtarget.inMips16Mode())
	          return std::make_pair(0U, &Mips::CPU16RegsRegClass);
	        return std::make_pair(0U, &Mips::GPR32RegClass);
	      }
	      if (VT == MVT::i64 && !Subtarget.isGP64bit())
	        return std::make_pair(0U, &Mips::GPR32RegClass);
	      if (VT == MVT::i64 && Subtarget.isGP64bit())
	        return std::make_pair(0U, &Mips::GPR64RegClass);
	      // This will generate an error message
	      return std::make_pair(0U, nullptr);
	    case 'f': // FPU or MSA register
	      if (VT == MVT::v16i8)
	        return std::make_pair(0U, &Mips::MSA128BRegClass);
	      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
	        return std::make_pair(0U, &Mips::MSA128HRegClass);
	      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
	        return std::make_pair(0U, &Mips::MSA128WRegClass);
	      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
	        return std::make_pair(0U, &Mips::MSA128DRegClass);
	      else if (VT == MVT::f32)
	        return std::make_pair(0U, &Mips::FGR32RegClass);
	      else if ((VT == MVT::f64) && (!Subtarget.isSingleFloat())) {
	        if (Subtarget.isFP64bit())
	          return std::make_pair(0U, &Mips::FGR64RegClass);
	        return std::make_pair(0U, &Mips::AFGR64RegClass);
	      }
	      // Any other type leaves the switch and is tried below.
	      break;
	    case 'c': // register suitable for indirect jump
	      if (VT == MVT::i32)
	        return std::make_pair((unsigned)Mips::T9, &Mips::GPR32RegClass);
	      if (VT == MVT::i64)
	        return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
	      // This will generate an error message
	      return std::make_pair(0U, nullptr);
	    case 'l': // use the `lo` register to store values
	              // that are no bigger than a word
	      if (VT == MVT::i32)
	        return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass);
	      return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass);
	    case 'x': // use the concatenated `hi` and `lo` registers
	              // to store doubleword values
	      // Fixme: Not triggering the use of both hi and low
	      // This will generate an error message
	      return std::make_pair(0U, nullptr);
	    }
	  }

	  // Not a single-letter constraint handled above: try an explicit physical
	  // register name such as "{$f4}".
	  std::pair<unsigned, const TargetRegisterClass *> R;
	  R = parseRegForInlineAsmConstraint(Constraint, VT);

	  if (R.second)
	    return R;

	  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
	}

	/// Lower the operand \p Op for the inline-asm constraint \p Constraint and
	/// append the result to \p Ops. Nothing is appended when the operand does
	/// not satisfy the constraint; the parent routine then reports the error.
	///
	/// Only the single-letter immediate constraints 'I', 'J', 'K', 'L', 'N',
	/// 'O' and 'P' are handled here. Other single letters are forwarded to the
	/// generic implementation, and constraints longer than one character are
	/// ignored.
	void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	                                                      std::string &Constraint,
	                                                      std::vector<SDValue>&Ops,
	                                                      SelectionDAG &DAG) const {
	  // Only support length 1 constraints for now.
	  if (Constraint.length() > 1)
	    return;

	  const char Letter = Constraint[0];
	  switch (Letter) {
	  case 'I':
	  case 'J':
	  case 'K':
	  case 'L':
	  case 'N':
	  case 'O':
	  case 'P':
	    break;
	  default:
	    // Not an immediate constraint of ours: use the generic implementation.
	    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	    return;
	  }

	  // All immediate constraints require a constant operand.
	  ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op);
	  if (!Imm)
	    return;

	  bool InRange = false;
	  uint64_t Encoded = 0;
	  switch (Letter) {
	  case 'I': { // Signed 16 bit constant
	    const int64_t V = Imm->getSExtValue();
	    InRange = isInt<16>(V);
	    Encoded = static_cast<uint64_t>(V);
	    break;
	  }
	  case 'J': { // integer zero
	    const int64_t V = Imm->getZExtValue();
	    InRange = (V == 0);
	    Encoded = 0;
	    break;
	  }
	  case 'K': { // unsigned 16 bit immediate
	    const uint64_t V = Imm->getZExtValue();
	    InRange = isUInt<16>(V);
	    Encoded = V;
	    break;
	  }
	  case 'L': { // signed 32 bit immediate where lower 16 bits are 0
	    const int64_t V = Imm->getSExtValue();
	    InRange = isInt<32>(V) && (V & 0xffff) == 0;
	    Encoded = static_cast<uint64_t>(V);
	    break;
	  }
	  case 'N': { // immediate in the range of -65535 to -1 (inclusive)
	    const int64_t V = Imm->getSExtValue();
	    InRange = (V >= -65535) && (V <= -1);
	    Encoded = static_cast<uint64_t>(V);
	    break;
	  }
	  case 'O': { // signed 15 bit immediate
	    const int64_t V = Imm->getSExtValue();
	    InRange = isInt<15>(V);
	    Encoded = static_cast<uint64_t>(V);
	    break;
	  }
	  case 'P': { // immediate in the range of 1 to 65535 (inclusive)
	    const int64_t V = Imm->getSExtValue();
	    InRange = (V <= 65535) && (V >= 1);
	    Encoded = static_cast<uint64_t>(V);
	    break;
	  }
	  }

	  // If this fails, the parent routine will give an error.
	  if (!InRange)
	    return;

	  SDLoc DL(Op);
	  Ops.push_back(DAG.getTargetConstant(Encoded, DL, Op.getValueType()));
	}

	/// Decide whether the addressing mode \p AM is accepted.
	///
	/// A mode with a global base is never accepted. Scale 0 ("r+i" or just "i",
	/// depending on HasBaseReg) is always accepted; scale 1 is accepted only
	/// without a separate base register ("r+i"), which rules out "r+r" and
	/// "r+r+i". Every other scale is rejected. \p DL, \p Ty, \p AS and \p I are
	/// not consulted.
	bool MipsTargetLowering::isLegalAddressingMode(const DataLayout &DL,
	                                               const AddrMode &AM, Type *Ty,
	                                               unsigned AS, Instruction *I) const {
	  // No global is ever allowed as a base.
	  if (AM.BaseGV)
	    return false;

	  if (AM.Scale == 0)
	    return true;

	  // A scaled register is only usable when it is the sole register.
	  return AM.Scale == 1 && !AM.HasBaseReg;
	}

	// Offset-folding query for the global address node GA. The argument is not
	// inspected: this implementation answers "no" for every global.
	bool
	MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
	// The Mips target isn't yet aware of offsets.
	return false;
	}

	/// Pick the value type used for memory operations: i64 when the subtarget
	/// has Mips64, i32 otherwise. None of the size, alignment or memset/memcpy
	/// parameters influence the choice.
	EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
	                                            unsigned SrcAlign,
	                                            bool IsMemset, bool ZeroMemset,
	                                            bool MemcpyStrSrc,
	                                            MachineFunction &MF) const {
	  return Subtarget.hasMips64() ? MVT::i64 : MVT::i32;
	}

	/// Report whether the floating-point immediate \p Imm is legal for type
	/// \p VT. Only positive zero of type f32 or f64 is accepted; negative zero
	/// and every non-zero value are rejected.
	bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	  const bool IsScalarFP = (VT == MVT::f32) || (VT == MVT::f64);
	  return IsScalarFP && !Imm.isNegZero() && Imm.isZero();
	}

	/// Select the jump table entry encoding. Position-independent N64 code uses
	/// EK_GPRel64BlockAddress; everything else uses the generic TargetLowering
	/// choice.
	unsigned MipsTargetLowering::getJumpTableEncoding() const {
	  if (!ABI.IsN64() || !isPositionIndependent())
	    return TargetLowering::getJumpTableEncoding();

	  // FIXME: For space reasons this should be: EK_GPRel32BlockAddress.
	  return MachineJumpTableInfo::EK_GPRel64BlockAddress;
	}

	// Forwards the soft-float query to the subtarget and returns its answer
	// unchanged.
	bool MipsTargetLowering::useSoftFloat() const {
	return Subtarget.useSoftFloat();
	}

	// Create the fixed stack object that backs an incoming byval argument and
	// store into it the argument registers [FirstReg, LastReg) that carry part
	// of the argument.
	//
	// The frame index of the object is appended to InVals. One store per
	// register is appended to OutChains; nothing is stored when the register
	// range is empty.
	void MipsTargetLowering::copyByValRegs(
	SDValue Chain, const SDLoc &DL, std::vector<SDValue> &OutChains,
	SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
	SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
	unsigned FirstReg, unsigned LastReg, const CCValAssign &VA,
	MipsCCState &State) const {
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	unsigned GPRSizeInBytes = Subtarget.getGPRSizeInBytes();
	unsigned NumRegs = LastReg - FirstReg;
	unsigned RegAreaSize = NumRegs * GPRSizeInBytes;
	// The object must hold the whole byval argument and the whole register area.
	unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
	int FrameObjOffset;
	ArrayRef<MCPhysReg> ByValArgRegs = ABI.GetByValArgRegs();

	// With registers involved, the offset is the callee-allocated argument area
	// size minus the space taken by the byval registers from FirstReg onwards;
	// otherwise the offset assigned by the calling convention is used.
	if (RegAreaSize)
	FrameObjOffset =
	(int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
	(int)((ByValArgRegs.size() - FirstReg) * GPRSizeInBytes);
	else
	FrameObjOffset = VA.getLocMemOffset();

	// Create frame object.
	EVT PtrTy = getPointerTy(DAG.getDataLayout());
	int FI = MFI.CreateFixedObject(FrameObjSize, FrameObjOffset, true);
	SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
	InVals.push_back(FIN);

	if (!NumRegs)
	return;

	// Copy arg registers.
	MVT RegTy = MVT::getIntegerVT(GPRSizeInBytes * 8);
	const TargetRegisterClass *RC = getRegClassFor(RegTy);

	// Register I is stored at byte offset I * GPRSizeInBytes within the object.
	for (unsigned I = 0; I < NumRegs; ++I) {
	unsigned ArgReg = ByValArgRegs[FirstReg + I];
	unsigned VReg = addLiveIn(MF, ArgReg, RC);
	unsigned Offset = I * GPRSizeInBytes;
	SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
	DAG.getConstant(Offset, DL, PtrTy));
	SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
	StorePtr, MachinePointerInfo(FuncArg, Offset));
	OutChains.push_back(Store);
	}
	}

	// Copy byVal arg to registers and stack.
	//
	// The argument at address Arg is passed in the byval argument registers
	// [FirstReg, LastReg) as far as they reach:
	//   * whole register-sized words are loaded and queued in RegsToPass;
	//   * when the last register is only partly filled, the remaining bytes are
	//     assembled from zero-extended sub-word loads, shifted according to
	//     isLittle and OR-ed together;
	//   * whatever does not fit in registers is copied with a memcpy to the
	//     stack location StackPtr + VA.getLocMemOffset().
	// The chain result of every load and of the memcpy is appended to
	// MemOpChains.
	void MipsTargetLowering::passByValArg(
	    SDValue Chain, const SDLoc &DL,
	    std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
	    SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
	    MachineFrameInfo &MFI, SelectionDAG &DAG, SDValue Arg, unsigned FirstReg,
	    unsigned LastReg, const ISD::ArgFlagsTy &Flags, bool isLittle,
	    const CCValAssign &VA) const {
	  unsigned ByValSizeInBytes = Flags.getByValSize();
	  unsigned OffsetInBytes = 0; // From beginning of struct
	  unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
	  unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
	  EVT PtrTy = getPointerTy(DAG.getDataLayout()),
	      RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
	  unsigned NumRegs = LastReg - FirstReg;

	  if (NumRegs) {
	    ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs();
	    // True when the registers offer more room than the argument needs, i.e.
	    // the last register is only partially filled.
	    bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes);
	    unsigned I = 0;

	    // Copy words to registers.
	    for (; I < NumRegs - LeftoverBytes; ++I, OffsetInBytes += RegSizeInBytes) {
	      SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
	                                    DAG.getConstant(OffsetInBytes, DL, PtrTy));
	      SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
	                                    MachinePointerInfo(), Alignment);
	      MemOpChains.push_back(LoadVal.getValue(1));
	      unsigned ArgReg = ArgRegs[FirstReg + I];
	      RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
	    }

	    // Return if the struct has been fully copied.
	    if (ByValSizeInBytes == OffsetInBytes)
	      return;

	    // Copy the remainder of the byval argument with sub-word loads and shifts.
	    if (LeftoverBytes) {
	      SDValue Val;

	      // Try load sizes of half a register, a quarter, ... and use each size
	      // that still fits into the bytes that remain.
	      for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0;
	           OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) {
	        unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes;

	        if (RemainingSizeInBytes < LoadSizeInBytes)
	          continue;

	        // Load subword.
	        SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
	                                      DAG.getConstant(OffsetInBytes, DL,
	                                                      PtrTy));
	        SDValue LoadVal = DAG.getExtLoad(
	            ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
	            MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment);
	        MemOpChains.push_back(LoadVal.getValue(1));

	        // Shift the loaded value.
	        unsigned Shamt;

	        if (isLittle)
	          Shamt = TotalBytesLoaded * 8;
	        else
	          Shamt = (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8;

	        SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal,
	                                    DAG.getConstant(Shamt, DL, MVT::i32));

	        if (Val.getNode())
	          Val = DAG.getNode(ISD::OR, DL, RegTy, Val, Shift);
	        else
	          Val = Shift;

	        OffsetInBytes += LoadSizeInBytes;
	        TotalBytesLoaded += LoadSizeInBytes;
	        Alignment = std::min(Alignment, LoadSizeInBytes);
	      }

	      unsigned ArgReg = ArgRegs[FirstReg + I];
	      RegsToPass.push_back(std::make_pair(ArgReg, Val));
	      return;
	    }
	  }

	  // Copy remainder of byval arg to it with memcpy.
	  unsigned MemCpySize = ByValSizeInBytes - OffsetInBytes;
	  SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
	                            DAG.getConstant(OffsetInBytes, DL, PtrTy));
	  SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
	                            DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
	  Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
	                        DAG.getConstant(MemCpySize, DL, PtrTy),
	                        Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
	                        /*isTailCall=*/false,
	                        MachinePointerInfo(), MachinePointerInfo());
	  MemOpChains.push_back(Chain);
	}

	// Spill the variable-argument registers that were not allocated to named
	// arguments into fixed stack objects, and record the frame index of the
	// first variable argument in MipsFunctionInfo. One store per spilled
	// register is appended to OutChains.
	void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
	SDValue Chain, const SDLoc &DL,
	SelectionDAG &DAG,
	CCState &State) const {
	ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs();
	// Index of the first vararg register not yet allocated.
	unsigned Idx = State.getFirstUnallocated(ArgRegs);
	unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
	MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
	const TargetRegisterClass *RC = getRegClassFor(RegTy);
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();

	// Offset of the first variable argument from stack pointer.
	int VaArgOffset;

	// All registers used: the first variable argument is at the next stack
	// offset, rounded up to the register size. Otherwise it lies
	// (number of unused registers) * RegSizeInBytes below the end of the
	// callee-allocated argument area.
	if (ArgRegs.size() == Idx)
	VaArgOffset = alignTo(State.getNextStackOffset(), RegSizeInBytes);
	else {
	VaArgOffset =
	(int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
	(int)(RegSizeInBytes * (ArgRegs.size() - Idx));
	}

	// Record the frame index of the first variable argument
	// which is a value necessary to VASTART.
	int FI = MFI.CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
	MipsFI->setVarArgsFrameIndex(FI);

	// Copy the integer registers that have not been used for argument passing
	// to the argument register save area. For O32, the save area is allocated
	// in the caller's stack frame, while for N32/64, it is allocated in the
	// callee's stack frame.
	for (unsigned I = Idx; I < ArgRegs.size();
	++I, VaArgOffset += RegSizeInBytes) {
	unsigned Reg = addLiveIn(MF, ArgRegs[I], RC);
	SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
	FI = MFI.CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
	SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	SDValue Store =
	DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo());
	// Clear the IR value attached to the store's memory operand.
	cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(
	(Value *)nullptr);
	OutChains.push_back(Store);
	}
	}

	/// Allocate argument registers for a byval argument and record the register
	/// range used in \p State.
	///
	/// \param State Calling-convention state; registers are marked allocated in
	///              it and the range [FirstReg, FirstReg + NumRegs) is added
	///              via addInRegsParamInfo.
	/// \param Size  In: byte size of the argument. Out: the number of bytes
	///              that did not fit into registers (the size is first rounded
	///              up to a multiple of the register size). Unchanged for the
	///              Fast calling convention, which uses no registers here.
	/// \param Align Requested alignment; capped at the stack alignment.
	void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
	                                     unsigned Align) const {
	  const TargetFrameLowering *TFL = Subtarget.getFrameLowering();

	  assert(Size && "Byval argument's size shouldn't be 0.");

	  Align = std::min(Align, TFL->getStackAlignment());

	  unsigned FirstReg = 0;
	  unsigned NumRegs = 0;

	  if (State->getCallingConv() != CallingConv::Fast) {
	    unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
	    ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs();
	    // FIXME: The O32 case actually describes no shadow registers.
	    const MCPhysReg *ShadowRegs =
	        ABI.IsO32() ? IntArgRegs.data() : Mips64DPRegs;

	    // We used to check the size as well but we can't do that anymore since
	    // CCState::HandleByVal() rounds up the size after calling this function.
	    assert(!(Align % RegSizeInBytes) &&
	           "Byval argument's alignment should be a multiple of "
	           "RegSizeInBytes.");

	    FirstReg = State->getFirstUnallocated(IntArgRegs);

	    // If Align > RegSizeInBytes, the first arg register must be even.
	    // FIXME: This condition happens to do the right thing but it's not the
	    //        right way to test it. We want to check that the stack frame offset
	    //        of the register is aligned.
	    if ((Align > RegSizeInBytes) && (FirstReg % 2)) {
	      State->AllocateReg(IntArgRegs[FirstReg], ShadowRegs[FirstReg]);
	      ++FirstReg;
	    }

	    // Mark the registers allocated.
	    Size = alignTo(Size, RegSizeInBytes);
	    for (unsigned I = FirstReg; Size > 0 && (I < IntArgRegs.size());
	         Size -= RegSizeInBytes, ++I, ++NumRegs)
	      State->AllocateReg(IntArgRegs[I], ShadowRegs[I]);
	  }

	  State->addInRegsParamInfo(FirstReg, FirstReg + NumRegs);
	}

	/// Expand a pseudo SELECT instruction into a diamond control-flow pattern.
	///
	/// \param MI      The pseudo instruction. Operand 0 is the destination,
	///                operand 1 the condition register, operands 2 and 3 the
	///                values merged by the PHI for the branch-taken and the
	///                fall-through path respectively. MI is erased.
	/// \param BB      The block containing \p MI.
	/// \param isFPCmp When true the branch takes only the condition register;
	///                otherwise it also takes Mips::ZERO as second operand.
	/// \param Opc     Opcode of the conditional branch to emit.
	/// \returns The new block that receives the remainder of \p BB and the PHI.
	MachineBasicBlock *MipsTargetLowering::emitPseudoSELECT(MachineInstr &MI,
	                                                        MachineBasicBlock *BB,
	                                                        bool isFPCmp,
	                                                        unsigned Opc) const {
	  assert(!(Subtarget.hasMips4() || Subtarget.hasMips32()) &&
	         "Subtarget already supports SELECT nodes with the use of "
	         "conditional-move instructions.");

	  const TargetInstrInfo *TII =
	      Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // To "insert" a SELECT instruction, we actually have to insert the
	  // diamond control-flow pattern.  The incoming instruction knows the
	  // destination vreg to set, the condition code register to branch on, the
	  // true/false values to select between, and a branch opcode to use.
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();
	  MachineFunction::iterator It = ++BB->getIterator();

	  //  thisMBB:
	  //  ...
	  //   TrueVal = ...
	  //   <conditional branch Opc on the condition register> sinkMBB
	  //   fallthrough --> copy0MBB
	  MachineBasicBlock *thisMBB  = BB;
	  MachineFunction *F = BB->getParent();
	  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
	  F->insert(It, copy0MBB);
	  F->insert(It, sinkMBB);

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), BB,
	                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // Next, add the true and fallthrough blocks as its successors.
	  BB->addSuccessor(copy0MBB);
	  BB->addSuccessor(sinkMBB);

	  if (isFPCmp) {
	    // bc1[tf] cc, sinkMBB
	    BuildMI(BB, DL, TII->get(Opc))
	        .addReg(MI.getOperand(1).getReg())
	        .addMBB(sinkMBB);
	  } else {
	    // bne rs, $0, sinkMBB
	    BuildMI(BB, DL, TII->get(Opc))
	        .addReg(MI.getOperand(1).getReg())
	        .addReg(Mips::ZERO)
	        .addMBB(sinkMBB);
	  }

	  //  copy0MBB:
	  //   %FalseValue = ...
	  //   # fallthrough to sinkMBB
	  BB = copy0MBB;

	  // Update machine-CFG edges
	  BB->addSuccessor(sinkMBB);

	  //  sinkMBB:
	  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
	  //  ...
	  BB = sinkMBB;

	  BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
	      .addReg(MI.getOperand(2).getReg())
	      .addMBB(thisMBB)
	      .addReg(MI.getOperand(3).getReg())
	      .addMBB(copy0MBB);

	  MI.eraseFromParent(); // The pseudo instruction is gone now.

	  return BB;
	}

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT,
	SelectionDAG &DAG) const {
	// Named registers is expected to be fairly rare. For now, just support $28
	// since the linux kernel uses it.
	if (Subtarget.isGP64bit()) {
	unsigned Reg = StringSwitch<unsigned>(RegName)
	.Case("$28", Mips::GP_64)
	.Default(0);
	if (Reg)
	return Reg;
	} else {
	unsigned Reg = StringSwitch<unsigned>(RegName)
	.Case("$28", Mips::GP)
	.Default(0);
	if (Reg)
	return Reg;
	}
	report_fatal_error("Invalid register name global variable");
	}
	Index: head/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp (revision 328817)
	@@ -1,187 +1,194 @@
	//===-- MipsTargetObjectFile.cpp - Mips Object Files ----------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "MipsTargetObjectFile.h"
	#include "MipsSubtarget.h"
	#include "MipsTargetMachine.h"
	#include "llvm/BinaryFormat/ELF.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCSectionELF.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Target/TargetMachine.h"
	using namespace llvm;

	// Command-line options controlling small data/bss section placement.

	// -mips-ssection-threshold: upper size bound used by IsInSmallSection();
	// initialised to 8.
	static cl::opt<unsigned>
	SSThreshold("mips-ssection-threshold", cl::Hidden,
	cl::desc("Small data and bss section threshold size (default=8)"),
	cl::init(8));

	// -mlocal-sdata: initialised to true.
	static cl::opt<bool>
	LocalSData("mlocal-sdata", cl::Hidden,
	cl::desc("MIPS: Use gp_rel for object-local data."),
	cl::init(true));

	// -mextern-sdata: initialised to true.
	static cl::opt<bool>
	ExternSData("mextern-sdata", cl::Hidden,
	cl::desc("MIPS: Use gp_rel for data that is not defined by the "
	"current object."),
	cl::init(true));

	// -membedded-data: initialised to false.
	static cl::opt<bool>
	EmbeddedData("membedded-data", cl::Hidden,
	cl::desc("MIPS: Try to allocate variables in the following"
	" sections if possible: .rodata, .sdata, .data ."),
	cl::init(false));

	/// Initialise the ELF object-file lowering and create the Mips small-data
	/// sections: ".sdata" (SHT_PROGBITS) and ".sbss" (SHT_NOBITS), both flagged
	/// writable, allocated and gp-relative. The target machine is remembered in
	/// this->TM.
	void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
	  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
	  InitializeELF(TM.Options.UseInitArray);

	  SmallDataSection = getContext().getELFSection(
	      ".sdata", ELF::SHT_PROGBITS,
	      ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL);

	  SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
	                                               ELF::SHF_WRITE | ELF::SHF_ALLOC |
	                                                   ELF::SHF_MIPS_GPREL);
	  this->TM = &static_cast<const MipsTargetMachine &>(TM);
	}

	// An address must be loaded from a small section if its size is less than
	// the small section size threshold. Data in this section must be addressed
	// using gp_rel operator.
	//
	// Returns true for a non-zero Size that does not exceed SSThreshold.
	static bool IsInSmallSection(uint64_t Size) {
	  // gcc has traditionally not treated zero-sized objects as small data, so
	  // this is effectively part of the ABI.
	  if (Size == 0)
	    return false;

	  return Size <= SSThreshold;
	}

	/// Return true if this global address should be placed into small data/bss
	/// section.
	///
	/// Declarations and available_externally globals are decided without a
	/// section kind; every other global is decided together with the kind
	/// computed by getKindForGlobal().
	bool MipsTargetObjectFile::IsGlobalInSmallSection(
	    const GlobalObject *GO, const TargetMachine &TM) const {
	  // We first check the case where global is a declaration, because finding
	  // section kind using getKindForGlobal() is only allowed for global
	  // definitions.
	  if (GO->isDeclaration() || GO->hasAvailableExternallyLinkage())
	    return IsGlobalInSmallSectionImpl(GO, TM);

	  return IsGlobalInSmallSection(GO, TM, getKindForGlobal(GO, TM));
	}

	/// Return true if this global address should be placed into small data/bss
	/// section, given its section kind.
	///
	/// The global must pass IsGlobalInSmallSectionImpl() and \p Kind must be
	/// data, BSS, common or read-only.
	bool MipsTargetObjectFile::
	IsGlobalInSmallSection(const GlobalObject *GO, const TargetMachine &TM,
	                       SectionKind Kind) const {
	  return IsGlobalInSmallSectionImpl(GO, TM) &&
	         (Kind.isData() || Kind.isBSS() || Kind.isCommon() ||
	          Kind.isReadOnly());
	}

	/// Return true if this global address should be placed into small data/bss
	/// section. This method does all the work, except for checking the section
	/// kind.
	///
	/// The checks run in this order: the subtarget must use the small section;
	/// the global must be a variable; an explicit section decides the answer on
	/// its own; the -mlocal-sdata, -mextern-sdata and -membedded-data options
	/// may veto; finally the allocation size of the value type is compared
	/// against the small-section threshold.
	bool MipsTargetObjectFile::
	IsGlobalInSmallSectionImpl(const GlobalObject *GO,
	const TargetMachine &TM) const {
	const MipsSubtarget &Subtarget =
	*static_cast<const MipsTargetMachine &>(TM).getSubtargetImpl();

	// Return if small section is not available.
	if (!Subtarget.useSmallSection())
	return false;

	// Only global variables, not functions.
	const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GO);
	if (!GVA)
	return false;

	// If the variable has an explicit section, it is placed in that section but
	// its addressing mode may change.
	if (GVA->hasSection()) {
	StringRef Section = GVA->getSection();

	// Explicitly placing any variable in the small data section overrides
	// the global -G value.
	if (Section == ".sdata" \|\| Section == ".sbss")
	return true;

	// Otherwise reject accessing it through the gp pointer. There are some
	// historic cases which GCC doesn't appear to respect any more. These
	// are .lit4, .lit8 and .srdata. For the moment reject these as well.
	return false;
	}

	// Enforce -mlocal-sdata.
	if (!LocalSData && GVA->hasLocalLinkage())
	return false;

	// Enforce -mextern-sdata.
	if (!ExternSData && ((GVA->hasExternalLinkage() && GVA->isDeclaration()) \|\|
	GVA->hasCommonLinkage()))
	return false;

	// Enforce -membedded-data.
	if (EmbeddedData && GVA->isConstant())
	return false;

	Type *Ty = GVA->getValueType();
	+
	+ // It is possible that the type of the global is unsized, i.e. a declaration
	+ // of a extern struct. In this case don't presume it is in the small data
	+ // section. This happens e.g. when building the FreeBSD kernel.
	+ if (!Ty->isSized())
	+ return false;
	+
	// Size check against the threshold, using the module's data layout.
	return IsInSmallSection(
	GVA->getParent()->getDataLayout().getTypeAllocSize(Ty));
	}

	/// Choose the output section for the global \p GO of kind \p Kind.
	///
	/// BSS globals that qualify for the small section go to the small BSS
	/// section; data and read-only globals that qualify go to the small data
	/// section. Everything else is placed by the ELF implementation.
	MCSection *MipsTargetObjectFile::SelectSectionForGlobal(
	    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
	  // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
	  // sections?

	  // Handle Small Section classification here.
	  const bool IsBSS = Kind.isBSS();
	  const bool SmallCandidate = IsBSS || Kind.isData() || Kind.isReadOnly();
	  if (SmallCandidate && IsGlobalInSmallSection(GO, TM, Kind)) {
	    if (IsBSS)
	      return SmallBSSSection;
	    return SmallDataSection;
	  }

	  // Otherwise, we work the same as ELF.
	  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
	}

	/// Return true if this constant should be placed into small data section.
	///
	/// That is the case when the subtarget uses the small section, the
	/// -mlocal-sdata option is on, and the allocation size of the constant's
	/// type passes IsInSmallSection().
	bool MipsTargetObjectFile::IsConstantInSmallSection(
	    const DataLayout &DL, const Constant *CN, const TargetMachine &TM) const {
	  const bool SmallSectionEnabled =
	      static_cast<const MipsTargetMachine &>(TM)
	          .getSubtargetImpl()
	          ->useSmallSection();
	  if (!SmallSectionEnabled)
	    return false;

	  if (!LocalSData)
	    return false;

	  // The size is only queried once both switches above allow small data.
	  return IsInSmallSection(DL.getTypeAllocSize(CN->getType()));
	}

	/// Return the section for the constant \p C: the small data section when
	/// IsConstantInSmallSection() accepts it, otherwise whatever the ELF
	/// implementation selects.
	MCSection *MipsTargetObjectFile::getSectionForConstant(const DataLayout &DL,
	                                                       SectionKind Kind,
	                                                       const Constant *C,
	                                                       unsigned &Align) const {
	  const bool UseSmallData = IsConstantInSmallSection(DL, C, *TM);
	  if (!UseSmallData) {
	    // Otherwise, we work the same as ELF.
	    return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C,
	                                                              Align);
	  }

	  return SmallDataSection;
	}

	/// Build the expression "Sym + 0x8000" used to refer to the thread-local
	/// symbol \p Sym in debug information.
	const MCExpr *
	MipsTargetObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
	  MCContext &Ctx = getContext();

	  const MCExpr *SymRef =
	      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
	  const MCExpr *Bias = MCConstantExpr::create(0x8000, Ctx);

	  return MCBinaryExpr::createAdd(SymRef, Bias, Ctx);
	}
	Index: head/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp (revision 328817)
	@@ -1,367 +1,386 @@
	//===-- SparcFrameLowering.cpp - Sparc Frame Information ------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the Sparc implementation of TargetFrameLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "SparcFrameLowering.h"
	#include "SparcInstrInfo.h"
	#include "SparcMachineFunctionInfo.h"
	#include "SparcSubtarget.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/Function.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Target/TargetOptions.h"

	using namespace llvm;

	// -disable-sparc-leaf-proc: hidden flag, initialised to false, that turns
	// off the Sparc leaf procedure optimization.
	static cl::opt<bool>
	DisableLeafProc("disable-sparc-leaf-proc",
	cl::init(false),
	cl::desc("Disable Sparc leaf procedure optimization."),
	cl::Hidden);

	// Configure the TargetFrameLowering base: the stack grows down, the second
	// and fourth arguments are 16 on 64-bit subtargets and 8 otherwise, and the
	// third argument is 0.
	// NOTE(review): the meaning of each positional argument is defined by the
	// TargetFrameLowering constructor, which is not visible here - confirm
	// against its declaration.
	SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST)
	: TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
	ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8) {}

	/// Emit instructions before \p MBBI that add \p NumBytes to the stack
	/// pointer register (SP::O6).
	///
	/// \param NumBytes Signed adjustment. Values in [-4096, 4096) are emitted
	///                 as a single \p ADDri with an immediate; larger values
	///                 are first materialised in SP::G1 and applied with
	///                 \p ADDrr.
	/// \param ADDrr    Opcode used for the register + register form.
	/// \param ADDri    Opcode used for the register + immediate form.
	void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
	                                          MachineBasicBlock &MBB,
	                                          MachineBasicBlock::iterator MBBI,
	                                          int NumBytes,
	                                          unsigned ADDrr,
	                                          unsigned ADDri) const {

	  DebugLoc dl;
	  const SparcInstrInfo &TII =
	      *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());

	  if (NumBytes >= -4096 && NumBytes < 4096) {
	    BuildMI(MBB, MBBI, dl, TII.get(ADDri), SP::O6)
	        .addReg(SP::O6).addImm(NumBytes);
	    return;
	  }

	  // Emit this the hard way.  This clobbers G1 which we always know is
	  // available here.
	  if (NumBytes >= 0) {
	    // Emit nonnegative numbers with sethi + or.
	    // sethi %hi(NumBytes), %g1
	    // or %g1, %lo(NumBytes), %g1
	    // add %sp, %g1, %sp
	    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
	        .addImm(HI22(NumBytes));
	    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
	        .addReg(SP::G1).addImm(LO10(NumBytes));
	    BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
	        .addReg(SP::O6).addReg(SP::G1);
	    return ;
	  }

	  // Emit negative numbers with sethi + xor.
	  // sethi %hix(NumBytes), %g1
	  // xor %g1, %lox(NumBytes), %g1
	  // add %sp, %g1, %sp
	  BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
	      .addImm(HIX22(NumBytes));
	  BuildMI(MBB, MBBI, dl, TII.get(SP::XORri), SP::G1)
	      .addReg(SP::G1).addImm(LOX10(NumBytes));
	  BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
	      .addReg(SP::O6).addReg(SP::G1);
	}

	// Emit the function prologue at the start of MBB, which must be the entry
	// block of MF (asserted below).
	//
	// The final frame size is computed here: the maximum call frame size is
	// added when the function adjusts the stack and the call frame is reserved,
	// the subtarget's getAdjustedFrameSize() is applied, and the result is
	// rounded up to the maximum object alignment. The size is written back to
	// the MachineFrameInfo, the stack-pointer adjustment is emitted (SAVE, or
	// ADD for leaf procedures), the CFI directives are emitted, and %sp is
	// realigned when dynamic stack realignment is needed. A leaf procedure with
	// an empty frame emits nothing at all.
	void SparcFrameLowering::emitPrologue(MachineFunction &MF,
	MachineBasicBlock &MBB) const {
	SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();

	assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
	MachineFrameInfo &MFI = MF.getFrameInfo();
	+ const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
	const SparcInstrInfo &TII =
	- static_cast<const SparcInstrInfo >(MF.getSubtarget().getInstrInfo());
	+ static_cast<const SparcInstrInfo >(Subtarget.getInstrInfo());
	const SparcRegisterInfo &RegInfo =
	- static_cast<const SparcRegisterInfo >(MF.getSubtarget().getRegisterInfo());
	+ static_cast<const SparcRegisterInfo >(Subtarget.getRegisterInfo());
	MachineBasicBlock::iterator MBBI = MBB.begin();
	// Debug location must be unknown since the first debug location is used
	// to determine the end of the prologue.
	DebugLoc dl;
	bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF);

	// FIXME: unfortunately, returning false from canRealignStack
	// actually just causes needsStackRealignment to return false,
	// rather than reporting an error, as would be sensible. This is
	// poor, but fixing that bogosity is going to be a large project.
	// For now, just see if it's lied, and report an error here.
	if (!NeedsStackRealignment && MFI.getMaxAlignment() > getStackAlignment())
	report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required "
	"stack re-alignment, but LLVM couldn't handle it "
	"(probably because it has a dynamic alloca).");

	// Get the number of bytes to allocate from the FrameInfo
	int NumBytes = (int) MFI.getStackSize();

	unsigned SAVEri = SP::SAVEri;
	unsigned SAVErr = SP::SAVErr;
	if (FuncInfo->isLeafProc()) {
	if (NumBytes == 0)
	return;
	SAVEri = SP::ADDri;
	SAVErr = SP::ADDrr;
	}

	// The SPARC ABI is a bit odd in that it requires a reserved 92-byte
	// (128 in v9) area in the user's stack, starting at %sp. Thus, the
	// first part of the stack that can actually be used is located at
	// %sp + 92.
	//
	// We therefore need to add that offset to the total stack size
	// after all the stack objects are placed by
	// PrologEpilogInserter calculateFrameObjectOffsets. However, since the stack needs to be
	// aligned after the extra size is added, we need to disable
	// calculateFrameObjectOffsets's built-in stack alignment, by having
	// targetHandlesStackFrameRounding return true.


	// Add the extra call frame stack size, if needed. (This is the same
	// code as in PrologEpilogInserter, but also gets disabled by
	// targetHandlesStackFrameRounding)
	if (MFI.adjustsStack() && hasReservedCallFrame(MF))
	NumBytes += MFI.getMaxCallFrameSize();

	// Adds the SPARC subtarget-specific spill area to the stack
	// size. Also ensures target-required alignment.
	- NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
	+ NumBytes = Subtarget.getAdjustedFrameSize(NumBytes);

	// Finally, ensure that the size is sufficiently aligned for the
	// data on the stack.
	if (MFI.getMaxAlignment() > 0) {
	NumBytes = alignTo(NumBytes, MFI.getMaxAlignment());
	}

	// Update stack size with corrected value.
	MFI.setStackSize(NumBytes);

	emitSPAdjustment(MF, MBB, MBBI, -NumBytes, SAVErr, SAVEri);

	unsigned regFP = RegInfo.getDwarfRegNum(SP::I6, true);

	// Emit ".cfi_def_cfa_register 30".
	unsigned CFIIndex =
	MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP));
	BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex);

	// Emit ".cfi_window_save".
	CFIIndex = MF.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
	BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex);

	unsigned regInRA = RegInfo.getDwarfRegNum(SP::I7, true);
	unsigned regOutRA = RegInfo.getDwarfRegNum(SP::O7, true);
	// Emit ".cfi_register 15, 31".
	CFIIndex = MF.addFrameInst(
	MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA));
	BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex);

	if (NeedsStackRealignment) {
	- // andn %o6, MaxAlign-1, %o6
	+ int64_t Bias = Subtarget.getStackPointerBias();
	+ unsigned regUnbiased;
	+ if (Bias) {
	+ // This clobbers G1 which we always know is available here.
	+ regUnbiased = SP::G1;
	+ // add %o6, BIAS, %g1
	+ BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), regUnbiased)
	+ .addReg(SP::O6).addImm(Bias);
	+ } else
	+ regUnbiased = SP::O6;
	+
	+ // andn %regUnbiased, MaxAlign-1, %regUnbiased
	int MaxAlign = MFI.getMaxAlignment();
	- BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1);
	+ BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), regUnbiased)
	+ .addReg(regUnbiased).addImm(MaxAlign - 1);
	+
	+ if (Bias) {
	+ // add %g1, -BIAS, %o6
	+ BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), SP::O6)
	+ .addReg(regUnbiased).addImm(-Bias);
	+ }
	}
	}

	// Remove the call-frame setup/destroy pseudo instruction at I and return
	// the iterator produced by erasing it. When the call frame is not reserved,
	// the pseudo is first replaced by an explicit stack-pointer adjustment:
	// negative for ADJCALLSTACKDOWN, positive otherwise.
	MachineBasicBlock::iterator SparcFrameLowering::
	eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
	                              MachineBasicBlock::iterator I) const {
	  // With a reserved call frame no adjustment is emitted here; the pseudo is
	  // simply dropped.
	  if (hasReservedCallFrame(MF))
	    return MBB.erase(I);

	  MachineInstr &Pseudo = *I;
	  const int Amount = Pseudo.getOperand(0).getImm();
	  const bool IsSetup = Pseudo.getOpcode() == SP::ADJCALLSTACKDOWN;
	  const int Delta = IsSetup ? -Amount : Amount;

	  // A zero-sized adjustment needs no instruction at all.
	  if (Delta != 0)
	    emitSPAdjustment(MF, MBB, I, Delta, SP::ADDrr, SP::ADDri);

	  return MBB.erase(I);
	}


	// Emit the function epilogue in front of the last non-debug instruction of
	// MBB, which must be a RETL (asserted below).
	//
	// Non-leaf procedures get a "restore %g0, %g0, %g0". Leaf procedures have
	// their frame released by adding the recorded stack size back to %sp;
	// nothing is emitted when that size is zero.
	void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
	                                      MachineBasicBlock &MBB) const {
	  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
	  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
	  // getInstrInfo() yields a pointer to the generic instruction info, so it
	  // has to be dereferenced before being downcast to the Sparc reference type.
	  const SparcInstrInfo &TII =
	      static_cast<const SparcInstrInfo &>(*MF.getSubtarget().getInstrInfo());
	  DebugLoc dl = MBBI->getDebugLoc();
	  assert(MBBI->getOpcode() == SP::RETL &&
	         "Can only put epilog before 'retl' instruction!");

	  if (!FuncInfo->isLeafProc()) {
	    BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
	        .addReg(SP::G0);
	    return;
	  }

	  // Leaf procedure: undo the prologue's plain ADD on %sp, if there was one.
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  int NumBytes = (int) MFI.getStackSize();
	  if (NumBytes == 0)
	    return;

	  emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
	}

	bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
	// Reserve call frame if there are no variable sized objects on the stack.
	return !MF.getFrameInfo().hasVarSizedObjects();
	}

	// hasFP - Return true if the specified function should have a dedicated frame
	// pointer register. This is true if frame pointer elimination is disabled,
	// if the stack needs realignment, if the function has variable sized allocas,
	// or if its frame address is taken. The checks short-circuit in that order.
	bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
	  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();

	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
	         RegInfo->needsStackRealignment(MF) ||
	         MFI.hasVarSizedObjects() ||
	         MFI.isFrameAddressTaken();
	}


	int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
	unsigned &FrameReg) const {
	const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	const SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
	bool isFixed = MFI.isFixedObjectIndex(FI);

	// Addressable stack objects are accessed using neg. offsets from
	// %fp, or positive offsets from %sp.
	bool UseFP;

	// Sparc uses FP-based references in general, even when "hasFP" is
	// false. That function is rather a misnomer, because %fp is
	// actually always available, unless isLeafProc.
	if (FuncInfo->isLeafProc()) {
	// If there's a leaf proc, all offsets need to be %sp-based,
	// because we haven't caused %fp to actually point to our frame.
	UseFP = false;
	} else if (isFixed) {
	// Otherwise, argument access should always use %fp.
	UseFP = true;
	} else if (RegInfo->needsStackRealignment(MF)) {
	// If there is dynamic stack realignment, all local object
	// references need to be via %sp, to take account of the
	// re-alignment.
	UseFP = false;
	} else {
	// Finally, default to using %fp.
	UseFP = true;
	}

	int64_t FrameOffset = MF.getFrameInfo().getObjectOffset(FI) +
	Subtarget.getStackPointerBias();

	if (UseFP) {
	FrameReg = RegInfo->getFrameRegister(MF);
	return FrameOffset;
	} else {
	FrameReg = SP::O6; // %sp
	return FrameOffset + MF.getFrameInfo().getStackSize();
	}
	}

	// Return true when none of the registers %i0-%i7 and %l0-%l7 is reported as
	// used by MRI. Only referenced from an assert in this file, hence the
	// unused attribute.
	static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
	{
	  // True when any register in the inclusive range [First, Last] is used.
	  auto AnyUsed = [MRI](unsigned First, unsigned Last) {
	    for (unsigned Reg = First; Reg <= Last; ++Reg)
	      if (MRI->isPhysRegUsed(Reg))
	        return true;
	    return false;
	  };

	  return !AnyUsed(SP::I0, SP::I7) && !AnyUsed(SP::L0, SP::L7);
	}

	// Return true when MF qualifies as a leaf procedure: it makes no calls,
	// does not use %l0 or %sp, and does not need a frame pointer. The checks
	// short-circuit in that order.
	bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
	{

	  MachineRegisterInfo &MRI = MF.getRegInfo();
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  return !(MFI.hasCalls()                 // has calls
	           || MRI.isPhysRegUsed(SP::L0)   // Too many registers needed
	           || MRI.isPhysRegUsed(SP::O6)   // %sp is used
	           || hasFP(MF));                 // need %fp
	}

	// Rewrite a leaf procedure so that it uses the %o registers in place of the
	// %i registers: every used %iN (and the even-aligned %iN_%iN+1 pair) is
	// replaced by its %o counterpart, and the live-in lists of all basic blocks
	// are updated to match.
	void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
	  MachineRegisterInfo &RegState = MF.getRegInfo();

	  // Remap %i[0-7] to %o[0-7].
	  for (unsigned InReg = SP::I0; InReg <= SP::I7; ++InReg) {
	    if (RegState.isPhysRegUsed(InReg)) {
	      const unsigned Index = InReg - SP::I0;

	      // Replace I register with O register.
	      RegState.replaceRegWith(InReg, Index + SP::O0);

	      // Even-numbered registers also start a register pair; remap the
	      // pair super-register as well.
	      if (Index % 2 == 0) {
	        const unsigned InPair = Index / 2 + SP::I0_I1;
	        RegState.replaceRegWith(InPair, InPair - SP::I0_I1 + SP::O0_O1);
	      }
	    }
	  }

	  // Rewrite MBB's Live-ins: pairs first, then single registers.
	  for (MachineBasicBlock &Block : MF) {
	    for (unsigned Pair = SP::I0_I1; Pair <= SP::I6_I7; ++Pair) {
	      if (Block.isLiveIn(Pair)) {
	        Block.removeLiveIn(Pair);
	        Block.addLiveIn(Pair - SP::I0_I1 + SP::O0_O1);
	      }
	    }
	    for (unsigned Single = SP::I0; Single <= SP::I7; ++Single) {
	      if (Block.isLiveIn(Single)) {
	        Block.removeLiveIn(Single);
	        Block.addLiveIn(Single - SP::I0 + SP::O0);
	      }
	    }
	  }

	  assert(verifyLeafProcRegUse(&RegState));
	#ifdef EXPENSIVE_CHECKS
	  MF.verify(0, "After LeafProc Remapping");
	#endif
	}

	// Run the generic callee-save computation, then, unless the leaf-procedure
	// optimization is disabled on the command line, mark qualifying functions
	// as leaf procedures and remap their registers.
	void SparcFrameLowering::determineCalleeSaves(MachineFunction &MF,
	                                              BitVector &SavedRegs,
	                                              RegScavenger *RS) const {
	  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);

	  // Nothing more to do when the optimization is off or does not apply.
	  if (DisableLeafProc || !isLeafProc(MF))
	    return;

	  SparcMachineFunctionInfo *LeafInfo = MF.getInfo<SparcMachineFunctionInfo>();
	  LeafInfo->setLeafProc(true);
	  remapRegsForLeafProc(MF);
	}
	Index: head/contrib/llvm/lib/Target/X86/X86.h
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86.h (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86.h (revision 328817)
	@@ -1,113 +1,117 @@
	//===-- X86.h - Top-level interface for X86 representation ------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the entry points for global functions defined in the x86
	// target library, as used by the LLVM JIT.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_X86_X86_H
	#define LLVM_LIB_TARGET_X86_X86_H

	#include "llvm/Support/CodeGen.h"

	namespace llvm {

	class FunctionPass;
	class ImmutablePass;
	class InstructionSelector;
	+class ModulePass;
	class PassRegistry;
	class X86RegisterBankInfo;
	class X86Subtarget;
	class X86TargetMachine;

	/// This pass converts a legalized DAG into a X86-specific DAG, ready for
	/// instruction scheduling.
	FunctionPass *createX86ISelDag(X86TargetMachine &TM,
	CodeGenOpt::Level OptLevel);

	/// This pass initializes a global base register for PIC on x86-32.
	FunctionPass *createX86GlobalBaseRegPass();

	/// This pass combines multiple accesses to local-dynamic TLS variables so that
	/// the TLS base address for the module is only fetched once per execution path
	/// through the function.
	FunctionPass *createCleanupLocalDynamicTLSPass();

	/// This function returns a pass which converts floating-point register
	/// references and pseudo instructions into floating-point stack references and
	/// physical instructions.
	FunctionPass *createX86FloatingPointStackifierPass();

	/// This pass inserts AVX vzeroupper instructions before each call to avoid
	/// transition penalty between functions encoded with AVX and SSE.
	FunctionPass *createX86IssueVZeroUpperPass();

	/// Return a pass that pads short functions with NOOPs.
	/// This will prevent a stall when returning on the Atom.
	FunctionPass *createX86PadShortFunctions();

	/// Return a pass that selectively replaces certain instructions (like add,
	/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA
	/// instructions, in order to eliminate execution delays in some processors.
	FunctionPass *createX86FixupLEAs();

	/// Return a pass that removes redundant LEA instructions and redundant address
	/// recalculations.
	FunctionPass *createX86OptimizeLEAs();

	/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
	FunctionPass *createX86FixupSetCC();

	/// Return a pass that expands WinAlloca pseudo-instructions.
	FunctionPass *createX86WinAllocaExpander();

	/// Return a pass that optimizes the code-size of x86 call sequences. This is
	/// done by replacing esp-relative movs with pushes.
	FunctionPass *createX86CallFrameOptimization();

	/// Return an IR pass that inserts EH registration stack objects and explicit
	/// EH state updates. This pass must run after EH preparation, which does
	/// Windows-specific but architecture-neutral preparation.
	FunctionPass *createX86WinEHStatePass();

	/// Return a Machine IR pass that expands X86-specific pseudo
	/// instructions into a sequence of actual instructions. This pass
	/// must run after prologue/epilogue insertion and before lowering
	/// the MachineInstr to MC.
	FunctionPass *createX86ExpandPseudoPass();

	/// This pass converts X86 cmov instructions into branch when profitable.
	FunctionPass *createX86CmovConverterPass();

	/// Return a Machine IR pass that selectively replaces
	/// certain byte and word instructions by equivalent 32 bit instructions,
	/// in order to eliminate partial register usage, false dependences on
	/// the upper portions of registers, and to save code size.
	FunctionPass *createX86FixupBWInsts();

	/// Return a Machine IR pass that reassigns instruction chains from one domain
	/// to another, when profitable.
	FunctionPass *createX86DomainReassignmentPass();

	void initializeFixupBWInstPassPass(PassRegistry &);

	/// This pass replaces EVEX-encoded AVX-512 instructions with their VEX
	/// encoding when possible in order to reduce code size.
	FunctionPass *createX86EvexToVexInsts();
	+
	+/// This pass creates the thunks for the retpoline feature.
	+FunctionPass *createX86RetpolineThunksPass();

	InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
	X86Subtarget &,
	X86RegisterBankInfo &);

	void initializeEvexToVexInstPassPass(PassRegistry &);

	} // End llvm namespace

	#endif
	Index: head/contrib/llvm/lib/Target/X86/X86.td
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86.td (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86.td (revision 328817)
	@@ -1,1048 +1,1069 @@
	//===-- X86.td - Target definition file for the Intel X86 --- tablegen --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This is a target description file for the Intel i386 architecture, referred
	// to here as the "X86" architecture.
	//
	//===----------------------------------------------------------------------===//

	// Get the target-independent interfaces which we are implementing...
	//
	include "llvm/Target/Target.td"

	//===----------------------------------------------------------------------===//
	// X86 Subtarget state
	//

	def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
	"64-bit mode (x86_64)">;
	def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
	"32-bit mode (80386)">;
	def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
	"16-bit mode (i8086)">;

	//===----------------------------------------------------------------------===//
	// X86 Subtarget features
	//===----------------------------------------------------------------------===//

	def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
	"Enable X87 float instructions">;

	def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
	"Enable conditional move instructions">;

	def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
	"Support POPCNT instruction">;

	def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true",
	"Support fxsave/fxrestore instructions">;

	def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
	"Support xsave instructions">;

	def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
	"Support xsaveopt instructions">;

	def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
	"Support xsavec instructions">;

	def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
	"Support xsaves instructions">;

	def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
	"Enable SSE instructions",
	// SSE codegen depends on cmovs, and all
	// SSE1+ processors support them.
	[FeatureCMOV]>;
	def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
	"Enable SSE2 instructions",
	[FeatureSSE1]>;
	def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
	"Enable SSE3 instructions",
	[FeatureSSE2]>;
	def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
	"Enable SSSE3 instructions",
	[FeatureSSE3]>;
	def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
	"Enable SSE 4.1 instructions",
	[FeatureSSSE3]>;
	def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
	"Enable SSE 4.2 instructions",
	[FeatureSSE41]>;
	// The MMX subtarget feature is separate from the rest of the SSE features
	// because it's important (for odd compatibility reasons) to be able to
	// turn it off explicitly while allowing SSE+ to be on.
	def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
	"Enable MMX instructions">;
	def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
	"Enable 3DNow! instructions",
	[FeatureMMX]>;
	def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
	"Enable 3DNow! Athlon instructions",
	[Feature3DNow]>;
	// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
	// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
	// without disabling 64-bit mode.
	def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
	"Support 64-bit instructions",
	[FeatureCMOV]>;
	def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
	"64-bit with cmpxchg16b",
	[Feature64Bit]>;
	def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
	"SHLD instruction is slow">;
	def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
	"PMULLD instruction is slow">;
	// FIXME: This should not apply to CPUs that do not have SSE.
	def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
	"IsUAMem16Slow", "true",
	"Slow unaligned 16-byte memory access">;
	def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
	"IsUAMem32Slow", "true",
	"Slow unaligned 32-byte memory access">;
	def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
	"Support SSE 4a instructions",
	[FeatureSSE3]>;

	def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
	"Enable AVX instructions",
	[FeatureSSE42]>;
	def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
	"Enable AVX2 instructions",
	[FeatureAVX]>;
	def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
	"Enable three-operand fused multiple-add",
	[FeatureAVX]>;
	def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
	"Support 16-bit floating point conversion instructions",
	[FeatureAVX]>;
	def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
	"Enable AVX-512 instructions",
	[FeatureAVX2, FeatureFMA, FeatureF16C]>;
	def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
	"Enable AVX-512 Exponential and Reciprocal Instructions",
	[FeatureAVX512]>;
	def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
	"Enable AVX-512 Conflict Detection Instructions",
	[FeatureAVX512]>;
	def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
	"true", "Enable AVX-512 Population Count Instructions",
	[FeatureAVX512]>;
	def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
	"Enable AVX-512 PreFetch Instructions",
	[FeatureAVX512]>;
	def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
	"true",
	"Prefetch with Intent to Write and T1 Hint">;
	def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
	"Enable AVX-512 Doubleword and Quadword Instructions",
	[FeatureAVX512]>;
	def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
	"Enable AVX-512 Byte and Word Instructions",
	[FeatureAVX512]>;
	def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
	"Enable AVX-512 Vector Length eXtensions",
	[FeatureAVX512]>;
	def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
	"Enable AVX-512 Vector Byte Manipulation Instructions",
	[FeatureBWI]>;
	def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true",
	"Enable AVX-512 further Vector Byte Manipulation Instructions",
	[FeatureBWI]>;
	def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
	"Enable AVX-512 Integer Fused Multiple-Add",
	[FeatureAVX512]>;
	def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
	"Enable protection keys">;
	def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
	"Enable AVX-512 Vector Neural Network Instructions",
	[FeatureAVX512]>;
	def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
	"Enable AVX-512 Bit Algorithms",
	[FeatureBWI]>;
	def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
	"Enable packed carry-less multiplication instructions",
	[FeatureSSE2]>;
	def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true",
	"Enable Galois Field Arithmetic Instructions",
	[FeatureSSE2]>;
	def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true",
	"Enable vpclmulqdq instructions",
	[FeatureAVX, FeaturePCLMUL]>;
	def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
	"Enable four-operand fused multiple-add",
	[FeatureAVX, FeatureSSE4A]>;
	def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
	"Enable XOP instructions",
	[FeatureFMA4]>;
	def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
	"HasSSEUnalignedMem", "true",
	"Allow unaligned memory operands with SSE instructions">;
	def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
	"Enable AES instructions",
	[FeatureSSE2]>;
	def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true",
	"Promote selected AES instructions to AVX512/AVX registers",
	[FeatureAVX, FeatureAES]>;
	def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
	"Enable TBM instructions">;
	def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true",
	"Enable LWP instructions">;
	def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
	"Support MOVBE instruction">;
	def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
	"Support RDRAND instruction">;
	def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
	"Support FS/GS Base instructions">;
	def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
	"Support LZCNT instruction">;
	def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
	"Support BMI instructions">;
	def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
	"Support BMI2 instructions">;
	def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
	"Support RTM instructions">;
	def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
	"Support ADX instructions">;
	def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
	"Enable SHA instructions",
	[FeatureSSE2]>;
	def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
	"Support CET Shadow-Stack instructions">;
	def FeatureIBT : SubtargetFeature<"ibt", "HasIBT", "true",
	"Support CET Indirect-Branch-Tracking instructions">;
	def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
	"Support PRFCHW instructions">;
	def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
	"Support RDSEED instruction">;
	def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
	"Support LAHF and SAHF instructions">;
	def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
	"Enable MONITORX/MWAITX timer functionality">;
	def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
	"Enable Cache Line Zero">;
	def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
	"Support MPX instructions">;
	def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
	"Use LEA for adjusting the stack pointer">;
	def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
	"HasSlowDivide32", "true",
	"Use 8-bit divide for positive values less than 256">;
	def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
	"HasSlowDivide64", "true",
	"Use 32-bit divide for positive values less than 2^32">;
	def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
	"PadShortFunctions", "true",
	"Pad short functions">;
	def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
	"Enable Software Guard Extensions">;
	def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
	"Flush A Cache Line Optimized">;
	def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
	"Cache Line Write Back">;
	// On some processors, instructions that implicitly take two memory operands are
	// slow. In practice, this means that CALL, PUSH, and POP with memory operands
	// should be avoided in favor of a MOV + register CALL/PUSH/POP.
	def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
	"SlowTwoMemOps", "true",
	"Two memory operand instructions are slow">;
	def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
	"LEA instruction needs inputs at AG stage">;
	def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
	"LEA instruction with certain arguments is slow">;
	def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
	"LEA instruction with 3 ops or certain registers is slow">;
	def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
	"INC and DEC instructions are slower than ADD and SUB">;
	def FeatureSoftFloat
	: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
	"Use software floating point features.">;
	// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
	// using a variable mask over multiple fixed shuffles.
	def FeatureFastVariableShuffle
	: SubtargetFeature<"fast-variable-shuffle",
	"HasFastVariableShuffle",
	"true", "Shuffles with variable masks are fast">;
	// On some X86 processors, there is no performance hazard to writing only the
	// lower parts of a YMM or ZMM register without clearing the upper part.
	def FeatureFastPartialYMMorZMMWrite
	: SubtargetFeature<"fast-partial-ymm-or-zmm-write",
	"HasFastPartialYMMorZMMWrite",
	"true", "Partial writes to YMM/ZMM registers are fast">;
	// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
	// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
	// vector FSQRT has higher throughput than the corresponding NR code.
	// The idea is that throughput bound code is likely to be vectorized, so for
	// vectorized code we should care about the throughput of SQRT operations.
	// But if the code is scalar that probably means that the code has some kind of
	// dependency and we should care more about reducing the latency.
	def FeatureFastScalarFSQRT
	: SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
	"true", "Scalar SQRT is fast (disable Newton-Raphson)">;
	def FeatureFastVectorFSQRT
	: SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
	"true", "Vector SQRT is fast (disable Newton-Raphson)">;
	// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
	// be used to replace test/set sequences.
	def FeatureFastLZCNT
	: SubtargetFeature<
	"fast-lzcnt", "HasFastLZCNT", "true",
	"LZCNT instructions are as fast as most simple integer ops">;


	// Sandy Bridge and newer processors can use SHLD with the same source on both
	// inputs to implement rotate to avoid the partial flag update of the normal
	// rotate instructions.
	def FeatureFastSHLDRotate
	: SubtargetFeature<
	"fast-shld-rotate", "HasFastSHLDRotate", "true",
	"SHLD can be used as a faster rotate">;

	// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
	// "string operations"). See "REP String Enhancement" in the Intel Software
	// Development Manual. This feature essentially means that REP MOVSB will copy
	// using the largest available size instead of copying bytes one by one, making
	// it at least as fast as REPMOVS{W,D,Q}.
	def FeatureERMSB
	: SubtargetFeature<
	"ermsb", "HasERMSB", "true",
	"REP MOVS/STOS are fast">;

	// Sandy Bridge and newer processors have many instructions that can be
	// fused with conditional branches and pass through the CPU as a single
	// operation.
	def FeatureMacroFusion
	: SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
	"Various instructions can be fused with conditional branches">;

	// Gather is available since Haswell (AVX2 set). So technically, we can
	// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
	// Skylake Client processor has faster Gathers than HSW and performance is
	// similar to Skylake Server (AVX-512).
	def FeatureHasFastGather
	: SubtargetFeature<"fast-gather", "HasFastGather", "true",
	"Indicates if gather is reasonably fast.">;

	+// Enable mitigation of some aspects of speculative execution related
	+// vulnerabilities by removing speculatable indirect branches. This disables
	+// jump-table formation, rewrites explicit `indirectbr` instructions into
	+// `switch` instructions, and uses a special construct called a "retpoline" to
	+// prevent speculation of the remaining indirect branches (indirect calls and
	+// tail calls).
	+def FeatureRetpoline
	+ : SubtargetFeature<"retpoline", "UseRetpoline", "true",
	+ "Remove speculation of indirect branches from the "
	+ "generated code, either by avoiding them entirely or "
	+ "lowering them with a speculation blocking construct.">;
	+
	+// Rely on external thunks for the emitted retpoline calls. This allows users
	+// to provide their own custom thunk definitions in highly specialized
	+// environments such as a kernel that does boot-time hot patching.
	+def FeatureRetpolineExternalThunk
	+ : SubtargetFeature<
	+ "retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
	+ "Enable retpoline, but with an externally provided thunk.",
	+ [FeatureRetpoline]>;
	+
	//===----------------------------------------------------------------------===//
	// Register File Description
	//===----------------------------------------------------------------------===//

	include "X86RegisterInfo.td"
	include "X86RegisterBanks.td"

	//===----------------------------------------------------------------------===//
	// Instruction Descriptions
	//===----------------------------------------------------------------------===//

	include "X86Schedule.td"
	include "X86InstrInfo.td"

	def X86InstrInfo : InstrInfo;

	//===----------------------------------------------------------------------===//
	// X86 processors supported.
	//===----------------------------------------------------------------------===//

	include "X86ScheduleAtom.td"
	include "X86SchedSandyBridge.td"
	include "X86SchedHaswell.td"
	include "X86SchedBroadwell.td"
	include "X86ScheduleSLM.td"
	include "X86ScheduleZnver1.td"
	include "X86ScheduleBtVer2.td"
	include "X86SchedSkylakeClient.td"
	include "X86SchedSkylakeServer.td"

	// Processor-family marker features. Each record sets the subtarget attribute
	// "X86ProcFamily" to the named family value; the processor definitions in
	// this file list these markers next to their ordinary ISA/tuning features.
	def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
	"Intel Atom processors">;
	def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
	"Intel Silvermont processors">;
	def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM",
	"Intel Goldmont processors">;
	def ProcIntelHSW : SubtargetFeature<"haswell", "X86ProcFamily",
	"IntelHaswell", "Intel Haswell processors">;
	def ProcIntelBDW : SubtargetFeature<"broadwell", "X86ProcFamily",
	"IntelBroadwell", "Intel Broadwell processors">;
	def ProcIntelSKL : SubtargetFeature<"skylake", "X86ProcFamily",
	"IntelSkylake", "Intel Skylake processors">;
	def ProcIntelKNL : SubtargetFeature<"knl", "X86ProcFamily",
	"IntelKNL", "Intel Knights Landing processors">;
	def ProcIntelSKX : SubtargetFeature<"skx", "X86ProcFamily",
	"IntelSKX", "Intel Skylake Server processors">;
	def ProcIntelCNL : SubtargetFeature<"cannonlake", "X86ProcFamily",
	"IntelCannonlake", "Intel Cannonlake processors">;
	def ProcIntelICL : SubtargetFeature<"icelake", "X86ProcFamily",
	"IntelIcelake", "Intel Icelake processors">;

	// Shorthand for a processor definition that uses the GenericModel scheduling
	// model: only the CPU name and its feature list have to be supplied.
	class Proc<string Name, list<SubtargetFeature> Features>
	: ProcessorModel<Name, GenericModel, Features>;

	def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>;
	def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
	def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
	def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>;
	def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>;
	def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;

	foreach P = ["i686", "pentiumpro"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
	}

	def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
	FeatureCMOV, FeatureFXSR]>;

	foreach P = ["pentium3", "pentium3m"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
	FeatureFXSR]>;
	}

	// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
	// The intent is to enable it for pentium4 which is the current default
	// processor in a vanilla 32-bit clang compilation when no specific
	// architecture is specified. This generally gives a nice performance
	// increase on silvermont, with largely neutral behavior on other
	// contemporary large core processors.
	// pentium-m, pentium4m, prescott and nocona are included as a preventative
	// measure to avoid performance surprises, in case clang's default cpu
	// changes slightly.

	def : ProcessorModel<"pentium-m", GenericPostRAModel,
	[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
	FeatureSSE2, FeatureFXSR]>;

	foreach P = ["pentium4", "pentium4m"] in {
	def : ProcessorModel<P, GenericPostRAModel,
	[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
	FeatureSSE2, FeatureFXSR]>;
	}

	// Intel Quark.
	def : Proc<"lakemont", []>;

	// Intel Core Duo.
	def : ProcessorModel<"yonah", SandyBridgeModel,
	[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
	FeatureFXSR]>;

	// NetBurst.
	def : ProcessorModel<"prescott", GenericPostRAModel,
	[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
	FeatureFXSR]>;
	def : ProcessorModel<"nocona", GenericPostRAModel, [
	FeatureX87,
	FeatureSlowUAMem16,
	FeatureMMX,
	FeatureSSE3,
	FeatureFXSR,
	FeatureCMPXCHG16B
	]>;

	// Intel Core 2 Solo/Duo.
	def : ProcessorModel<"core2", SandyBridgeModel, [
	FeatureX87,
	FeatureSlowUAMem16,
	FeatureMMX,
	FeatureSSSE3,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;
	def : ProcessorModel<"penryn", SandyBridgeModel, [
	FeatureX87,
	FeatureSlowUAMem16,
	FeatureMMX,
	FeatureSSE41,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;

	// Atom CPUs.
	class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
	ProcIntelAtom,
	FeatureX87,
	FeatureSlowUAMem16,
	FeatureMMX,
	FeatureSSSE3,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeatureMOVBE,
	FeatureLEAForSP,
	FeatureSlowDivide32,
	FeatureSlowDivide64,
	FeatureSlowTwoMemOps,
	FeatureLEAUsesAG,
	FeaturePadShortFunctions,
	FeatureLAHFSAHF
	]>;
	def : BonnellProc<"bonnell">;
	def : BonnellProc<"atom">; // Pin the generic name to the baseline.

	class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
	ProcIntelSLM,
	FeatureX87,
	FeatureMMX,
	FeatureSSE42,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeatureMOVBE,
	FeaturePOPCNT,
	FeaturePCLMUL,
	FeatureAES,
	FeatureSlowDivide64,
	FeatureSlowTwoMemOps,
	FeaturePRFCHW,
	FeatureSlowLEA,
	FeatureSlowIncDec,
	FeatureSlowPMULLD,
	FeatureLAHFSAHF
	]>;
	def : SilvermontProc<"silvermont">;
	def : SilvermontProc<"slm">; // Legacy alias.

	class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
	ProcIntelGLM,
	FeatureX87,
	FeatureMMX,
	FeatureSSE42,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeatureMOVBE,
	FeaturePOPCNT,
	FeaturePCLMUL,
	FeatureAES,
	FeaturePRFCHW,
	FeatureSlowTwoMemOps,
	FeatureSlowLEA,
	FeatureSlowIncDec,
	FeatureLAHFSAHF,
	FeatureMPX,
	FeatureSHA,
	FeatureRDRAND,
	FeatureRDSEED,
	FeatureXSAVE,
	FeatureXSAVEOPT,
	FeatureXSAVEC,
	FeatureXSAVES,
	FeatureCLFLUSHOPT,
	FeatureFSGSBase
	]>;
	def : GoldmontProc<"goldmont">;

	// "Arrandale" along with corei3 and corei5
	class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
	FeatureX87,
	FeatureMMX,
	FeatureSSE42,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeaturePOPCNT,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;
	def : NehalemProc<"nehalem">;
	def : NehalemProc<"corei7">;

	// Westmere is a similar machine to nehalem with some additional features.
	// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
	class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
	FeatureX87,
	FeatureMMX,
	FeatureSSE42,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeaturePOPCNT,
	FeatureAES,
	FeaturePCLMUL,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;
	def : WestmereProc<"westmere">;

	// Builds a cumulative feature list: Value is the Inherited list followed by
	// NewFeatures. This lets one CPU generation extend the list of its
	// predecessor (e.g. IVBFeatures is built from SNBFeatures.Value).
	class ProcessorFeatures<list<SubtargetFeature> Inherited,
	list<SubtargetFeature> NewFeatures> {
	list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
	}

	// A ProcessorModel whose feature list is the concatenation of ProcFeatures
	// (in this file, a shared <CPU>Features.Value list) and OtherFeatures
	// (extra features given directly at the processor class definition).
	class ProcModel<string Name, SchedMachineModel Model,
	list<SubtargetFeature> ProcFeatures,
	list<SubtargetFeature> OtherFeatures> :
	ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;

	// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
	// rather than a superset.
	def SNBFeatures : ProcessorFeatures<[], [
	FeatureX87,
	FeatureMMX,
	FeatureAVX,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeaturePOPCNT,
	FeatureAES,
	FeatureSlowDivide64,
	FeaturePCLMUL,
	FeatureXSAVE,
	FeatureXSAVEOPT,
	FeatureLAHFSAHF,
	FeatureSlow3OpsLEA,
	FeatureFastScalarFSQRT,
	FeatureFastSHLDRotate,
	FeatureSlowIncDec,
	FeatureMacroFusion
	]>;

	class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
	SNBFeatures.Value, [
	FeatureSlowUAMem32
	]>;
	def : SandyBridgeProc<"sandybridge">;
	def : SandyBridgeProc<"corei7-avx">; // Legacy alias.

	def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
	FeatureRDRAND,
	FeatureF16C,
	FeatureFSGSBase
	]>;

	class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
	IVBFeatures.Value, [
	FeatureSlowUAMem32
	]>;
	def : IvyBridgeProc<"ivybridge">;
	def : IvyBridgeProc<"core-avx-i">; // Legacy alias.

	def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
	FeatureAVX2,
	FeatureBMI,
	FeatureBMI2,
	FeatureERMSB,
	FeatureFMA,
	FeatureLZCNT,
	FeatureMOVBE,
	FeatureFastVariableShuffle
	]>;

	class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
	HSWFeatures.Value, [
	ProcIntelHSW
	]>;
	def : HaswellProc<"haswell">;
	def : HaswellProc<"core-avx2">; // Legacy alias.

	def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
	FeatureADX,
	FeatureRDSEED,
	FeaturePRFCHW
	]>;
	class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
	BDWFeatures.Value, [
	ProcIntelBDW
	]>;
	def : BroadwellProc<"broadwell">;

	def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
	FeatureMPX,
	FeatureRTM,
	FeatureXSAVEC,
	FeatureXSAVES,
	FeatureSGX,
	FeatureCLFLUSHOPT,
	FeatureFastVectorFSQRT
	]>;

	class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
	SKLFeatures.Value, [
	ProcIntelSKL,
	FeatureHasFastGather
	]>;
	def : SkylakeClientProc<"skylake">;

	def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [
	FeatureAVX512,
	FeatureERI,
	FeatureCDI,
	FeaturePFI,
	FeaturePREFETCHWT1,
	FeatureADX,
	FeatureRDSEED,
	FeatureMOVBE,
	FeatureLZCNT,
	FeatureBMI,
	FeatureBMI2,
	FeatureFMA,
	FeaturePRFCHW
	]>;

	// FIXME: define KNL model
	class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
	KNLFeatures.Value, [
	ProcIntelKNL,
	FeatureSlowTwoMemOps,
	FeatureFastPartialYMMorZMMWrite,
	FeatureHasFastGather
	]>;
	def : KnightsLandingProc<"knl">;

	class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel,
	KNLFeatures.Value, [
	ProcIntelKNL,
	FeatureSlowTwoMemOps,
	FeatureFastPartialYMMorZMMWrite,
	FeatureHasFastGather,
	FeatureVPOPCNTDQ
	]>;
	def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features

	def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
	FeatureAVX512,
	FeatureCDI,
	FeatureDQI,
	FeatureBWI,
	FeatureVLX,
	FeaturePKU,
	FeatureCLWB
	]>;

	class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
	SKXFeatures.Value, [
	ProcIntelSKX,
	FeatureHasFastGather
	]>;
	def : SkylakeServerProc<"skylake-avx512">;
	def : SkylakeServerProc<"skx">; // Legacy alias.

	def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [
	FeatureVBMI,
	FeatureIFMA,
	FeatureSHA
	]>;

	class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
	CNLFeatures.Value, [
	ProcIntelCNL,
	FeatureHasFastGather
	]>;
	def : CannonlakeProc<"cannonlake">;

	def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [
	FeatureBITALG,
	FeatureVAES,
	FeatureVBMI2,
	FeatureVNNI,
	FeatureVPCLMULQDQ,
	FeatureVPOPCNTDQ,
	FeatureGFNI,
	FeatureCLWB
	]>;

	class IcelakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
	ICLFeatures.Value, [
	ProcIntelICL,
	FeatureHasFastGather
	]>;
	def : IcelakeProc<"icelake">;

	// AMD CPUs.

	def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
	def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
	def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;

	foreach P = ["athlon", "athlon-tbird"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, FeatureSlowSHLD]>;
	}

	foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
	Feature3DNowA, FeatureFXSR, FeatureSlowSHLD]>;
	}

	foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
	FeatureFXSR, Feature64Bit, FeatureSlowSHLD]>;
	}

	foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
	FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowSHLD]>;
	}

	foreach P = ["amdfam10", "barcelona"] in {
	def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR,
	FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
	FeatureSlowSHLD, FeatureLAHFSAHF]>;
	}

	// Bobcat
	def : Proc<"btver1", [
	FeatureX87,
	FeatureMMX,
	FeatureSSSE3,
	FeatureSSE4A,
	FeatureFXSR,
	FeatureCMPXCHG16B,
	FeaturePRFCHW,
	FeatureLZCNT,
	FeaturePOPCNT,
	FeatureSlowSHLD,
	FeatureLAHFSAHF
	]>;

	// Jaguar
	def : ProcessorModel<"btver2", BtVer2Model, [
	FeatureX87,
	FeatureMMX,
	FeatureAVX,
	FeatureFXSR,
	FeatureSSE4A,
	FeatureCMPXCHG16B,
	FeaturePRFCHW,
	FeatureAES,
	FeaturePCLMUL,
	FeatureBMI,
	FeatureF16C,
	FeatureMOVBE,
	FeatureLZCNT,
	FeatureFastLZCNT,
	FeaturePOPCNT,
	FeatureXSAVE,
	FeatureXSAVEOPT,
	FeatureSlowSHLD,
	FeatureLAHFSAHF,
	FeatureFastPartialYMMorZMMWrite
	]>;

	// Bulldozer
	def : Proc<"bdver1", [
	FeatureX87,
	FeatureXOP,
	FeatureFMA4,
	FeatureCMPXCHG16B,
	FeatureAES,
	FeaturePRFCHW,
	FeaturePCLMUL,
	FeatureMMX,
	FeatureAVX,
	FeatureFXSR,
	FeatureSSE4A,
	FeatureLZCNT,
	FeaturePOPCNT,
	FeatureXSAVE,
	FeatureLWP,
	FeatureSlowSHLD,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;
	// Piledriver
	def : Proc<"bdver2", [
	FeatureX87,
	FeatureXOP,
	FeatureFMA4,
	FeatureCMPXCHG16B,
	FeatureAES,
	FeaturePRFCHW,
	FeaturePCLMUL,
	FeatureMMX,
	FeatureAVX,
	FeatureFXSR,
	FeatureSSE4A,
	FeatureF16C,
	FeatureLZCNT,
	FeaturePOPCNT,
	FeatureXSAVE,
	FeatureBMI,
	FeatureTBM,
	FeatureLWP,
	FeatureFMA,
	FeatureSlowSHLD,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;

	// Steamroller
	def : Proc<"bdver3", [
	FeatureX87,
	FeatureXOP,
	FeatureFMA4,
	FeatureCMPXCHG16B,
	FeatureAES,
	FeaturePRFCHW,
	FeaturePCLMUL,
	FeatureMMX,
	FeatureAVX,
	FeatureFXSR,
	FeatureSSE4A,
	FeatureF16C,
	FeatureLZCNT,
	FeaturePOPCNT,
	FeatureXSAVE,
	FeatureBMI,
	FeatureTBM,
	FeatureLWP,
	FeatureFMA,
	FeatureXSAVEOPT,
	FeatureSlowSHLD,
	FeatureFSGSBase,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;

	// Excavator
	def : Proc<"bdver4", [
	FeatureX87,
	FeatureMMX,
	FeatureAVX2,
	FeatureFXSR,
	FeatureXOP,
	FeatureFMA4,
	FeatureCMPXCHG16B,
	FeatureAES,
	FeaturePRFCHW,
	FeaturePCLMUL,
	FeatureF16C,
	FeatureLZCNT,
	FeaturePOPCNT,
	FeatureXSAVE,
	FeatureBMI,
	FeatureBMI2,
	FeatureTBM,
	FeatureLWP,
	FeatureFMA,
	FeatureXSAVEOPT,
	FeatureSlowSHLD,
	FeatureFSGSBase,
	FeatureLAHFSAHF,
	FeatureMWAITX,
	FeatureMacroFusion
	]>;

	// Znver1
	def: ProcessorModel<"znver1", Znver1Model, [
	FeatureADX,
	FeatureAES,
	FeatureAVX2,
	FeatureBMI,
	FeatureBMI2,
	FeatureCLFLUSHOPT,
	FeatureCLZERO,
	FeatureCMPXCHG16B,
	FeatureF16C,
	FeatureFMA,
	FeatureFSGSBase,
	FeatureFXSR,
	FeatureFastLZCNT,
	FeatureLAHFSAHF,
	FeatureLZCNT,
	FeatureMacroFusion,
	FeatureMMX,
	FeatureMOVBE,
	FeatureMWAITX,
	FeaturePCLMUL,
	FeaturePOPCNT,
	FeaturePRFCHW,
	FeatureRDRAND,
	FeatureRDSEED,
	FeatureSHA,
	FeatureSSE4A,
	FeatureSlowSHLD,
	FeatureX87,
	FeatureXSAVE,
	FeatureXSAVEC,
	FeatureXSAVEOPT,
	FeatureXSAVES]>;

	def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;

	def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
	def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
	def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
	def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
	FeatureSSE1, FeatureFXSR]>;

	// We also provide a generic 64-bit specific x86 processor model which tries to
	// be good for modern chips without enabling instruction set encodings past the
	// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
	// modern 64-bit x86 chip, and enables features that are generally beneficial.
	//
	// We currently use the Sandy Bridge model as the default scheduling model as
	// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
	// covers a huge swath of x86 processors. If there are specific scheduling
	// knobs which need to be tuned differently for AMD chips, we might consider
	// forming a common base for them.
	def : ProcessorModel<"x86-64", SandyBridgeModel, [
	FeatureX87,
	FeatureMMX,
	FeatureSSE2,
	FeatureFXSR,
	Feature64Bit,
	FeatureSlow3OpsLEA,
	FeatureSlowIncDec,
	FeatureMacroFusion
	]>;

	//===----------------------------------------------------------------------===//
	// Calling Conventions
	//===----------------------------------------------------------------------===//

	include "X86CallingConv.td"


	//===----------------------------------------------------------------------===//
	// Assembly Parser
	//===----------------------------------------------------------------------===//

	def ATTAsmParserVariant : AsmParserVariant {
	int Variant = 0;

	// Variant name.
	string Name = "att";

	// Discard comments in assembly strings.
	string CommentDelimiter = "#";

	// Recognize hard coded registers.
	string RegisterPrefix = "%";
	}

	def IntelAsmParserVariant : AsmParserVariant {
	int Variant = 1;

	// Variant name.
	string Name = "intel";

	// Discard comments in assembly strings.
	string CommentDelimiter = ";";

	// Recognize hard coded registers.
	string RegisterPrefix = "";
	}

	//===----------------------------------------------------------------------===//
	// Assembly Printers
	//===----------------------------------------------------------------------===//

	// The X86 target supports two different syntaxes for emitting machine code.
	// This is controlled by the -x86-asm-syntax={att|intel} option.
	def ATTAsmWriter : AsmWriter {
	string AsmWriterClassName = "ATTInstPrinter";
	int Variant = 0;
	}
	def IntelAsmWriter : AsmWriter {
	string AsmWriterClassName = "IntelInstPrinter";
	int Variant = 1;
	}

	// Top-level target record for X86. It names the instruction set and wires up
	// the AT&T and Intel assembly parser variants and assembly writers that are
	// defined earlier in this file.
	def X86 : Target {
	// Information about the instructions...
	let InstructionSet = X86InstrInfo;
	let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
	let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
	}
	Index: head/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86AsmPrinter.h (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86AsmPrinter.h (revision 328817)
	@@ -1,147 +1,148 @@
	//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
	#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H

	#include "X86Subtarget.h"
	#include "llvm/CodeGen/AsmPrinter.h"
	#include "llvm/CodeGen/FaultMaps.h"
	#include "llvm/CodeGen/StackMaps.h"
	#include "llvm/MC/MCCodeEmitter.h"
	#include "llvm/Target/TargetMachine.h"

	// Implemented in X86MCInstLower.cpp
	namespace {
	class X86MCInstLower;
	}

	namespace llvm {
	class MCStreamer;
	class MCSymbol;

	/// X86 implementation of AsmPrinter. Besides the generic AsmPrinter hooks it
	/// declares lowering helpers for stackmap/patchpoint/statepoint, XRay and
	/// TLS pseudo-instructions, and tracks stackmap "shadow" bytes so that NOP
	/// padding can be kept minimal.
	class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
	// Subtarget exposed through getSubtarget().
	const X86Subtarget *Subtarget;
	// Stack map and fault map state; both are reset in doInitialization().
	StackMaps SM;
	FaultMaps FM;
	// Passed to StackMapShadowTracker::count() together with each instruction;
	// NOTE(review): presumably used to measure encoded instruction sizes - confirm
	// in the out-of-line definition.
	std::unique_ptr<MCCodeEmitter> CodeEmitter;
	bool EmitFPOData = false;
	+ bool NeedsRetpoline = false;

	// This utility class tracks the length of a stackmap instruction's 'shadow'.
	// It is used by the X86AsmPrinter to ensure that the stackmap shadow
	// invariants (i.e. no other stackmaps, patchpoints, or control flow within
	// the shadow) are met, while outputting a minimal number of NOPs for padding.
	//
	// To minimise the number of NOPs used, the shadow tracker counts the number
	// of instruction bytes output since the last stackmap. Only if there are too
	// few instruction bytes to cover the shadow are NOPs used for padding.
	class StackMapShadowTracker {
	public:
	// Remember the function whose body is about to be emitted.
	void startFunction(MachineFunction &MF) {
	this->MF = &MF;
	}
	// Account for one emitted instruction (see the class comment). Defined out
	// of line.
	void count(MCInst &Inst, const MCSubtargetInfo &STI,
	MCCodeEmitter *CodeEmitter);

	// Called to signal the start of a shadow of RequiredSize bytes.
	void reset(unsigned RequiredSize) {
	RequiredShadowSize = RequiredSize;
	CurrentShadowSize = 0;
	InShadow = true;
	}

	// Called before every stackmap/patchpoint, and at the end of basic blocks,
	// to emit any necessary padding-NOPs.
	void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
	private:
	// Function set by startFunction(); not initialized before that call.
	const MachineFunction *MF;
	// Set to true by reset(); false until the first reset() call.
	bool InShadow = false;

	// RequiredShadowSize holds the length of the shadow specified in the most
	// recently encountered STACKMAP instruction.
	// CurrentShadowSize counts the number of bytes encoded since the most
	// recently encountered STACKMAP, stopping when that number is greater than
	// or equal to RequiredShadowSize.
	unsigned RequiredShadowSize = 0, CurrentShadowSize = 0;
	};

	StackMapShadowTracker SMShadowTracker;

	// All instructions emitted by the X86AsmPrinter should use this helper
	// method.
	//
	// This helper function invokes the SMShadowTracker on each instruction before
	// outputting it to the OutStream. This allows the shadow tracker to minimise
	// the number of NOPs used for stackmap padding.
	void EmitAndCountInstruction(MCInst &Inst);
	// Lowering helpers for the corresponding pseudo-instructions; all are
	// defined out of line.
	void LowerSTACKMAP(const MachineInstr &MI);
	void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
	void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
	void LowerFAULTING_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
	void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL);

	void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);

	// XRay-specific lowering for X86.
	void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
	X86MCInstLower &MCIL);
	void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
	void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
	void LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);

	void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);

	// Choose between emitting .seh_ directives and .cv_fpo_ directives.
	void EmitSEHInstruction(const MachineInstr *MI);

	public:
	X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);

	StringRef getPassName() const override {
	return "X86 Assembly Printer";
	}

	// Dereferences Subtarget without a null check.
	const X86Subtarget &getSubtarget() const { return *Subtarget; }

	void EmitStartOfAsmFile(Module &M) override;

	void EmitEndOfAsmFile(Module &M) override;

	void EmitInstruction(const MachineInstr *MI) override;

	// Run the generic end-of-block handling first, then emit whatever padding
	// the stackmap shadow tracker still requires.
	void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override {
	AsmPrinter::EmitBasicBlockEnd(MBB);
	SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
	}

	bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
	unsigned AsmVariant, const char *ExtraCode,
	raw_ostream &OS) override;
	bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
	unsigned AsmVariant, const char *ExtraCode,
	raw_ostream &OS) override;

	/// \brief Return the symbol for the specified constant pool entry.
	MCSymbol *GetCPISymbol(unsigned CPID) const override;

	// Clear the shadow tracker, stack maps and fault maps before delegating to
	// the base-class initialization.
	bool doInitialization(Module &M) override {
	SMShadowTracker.reset(0);
	SM.reset();
	FM.reset();
	return AsmPrinter::doInitialization(M);
	}

	bool runOnMachineFunction(MachineFunction &F) override;
	void EmitFunctionBodyStart() override;
	void EmitFunctionBodyEnd() override;
	};

	} // end namespace llvm

	#endif
	Index: head/contrib/llvm/lib/Target/X86/X86FastISel.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86FastISel.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86FastISel.cpp (revision 328817)
	@@ -1,3998 +1,4002 @@
	//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the X86-specific support for the FastISel class. Much
	// of the target-specific code is generated by tablegen in the file
	// X86GenFastISel.inc, which is #included here.
	//
	//===----------------------------------------------------------------------===//

	#include "X86.h"
	#include "X86CallingConv.h"
	#include "X86InstrBuilder.h"
	#include "X86InstrInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86RegisterInfo.h"
	#include "X86Subtarget.h"
	#include "X86TargetMachine.h"
	#include "llvm/Analysis/BranchProbabilityInfo.h"
	#include "llvm/CodeGen/FastISel.h"
	#include "llvm/CodeGen/FunctionLoweringInfo.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Target/TargetOptions.h"
	using namespace llvm;

	namespace {

	/// X86-specific FastISel implementation. The tablegen-generated selection
	/// code is pulled in from X86GenFastISel.inc inside the class body; the
	/// hand-written X86Select* / X86FastEmit* helpers declared below are defined
	/// out of line in this file.
	class X86FastISel final : public FastISel {
	  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
	  /// make the right decision when generating code for different targets.
	  const X86Subtarget *Subtarget;

	  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
	  /// floating point ops.
	  /// When SSE is available, use it for f32 operations.
	  /// When SSE2 is available, use it for f64 operations.
	  bool X86ScalarSSEf64;
	  bool X86ScalarSSEf32;

	public:
	  /// Caches the function's X86 subtarget and derives the scalar-SSE flags
	  /// from it (SSE2 for f64, SSE1 for f32).
	  explicit X86FastISel(FunctionLoweringInfo &funcInfo,
	                       const TargetLibraryInfo *libInfo)
	      : FastISel(funcInfo, libInfo) {
	    Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
	    X86ScalarSSEf64 = Subtarget->hasSSE2();
	    X86ScalarSSEf32 = Subtarget->hasSSE1();
	  }

	  bool fastSelectInstruction(const Instruction *I) override;

	  /// \brief The specified machine instr operand is a vreg, and that
	  /// vreg is being provided by the specified load instruction. If possible,
	  /// try to fold the load as an operand to the instruction, returning true if
	  /// possible.
	  bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
	                           const LoadInst *LI) override;

	  bool fastLowerArguments() override;
	  bool fastLowerCall(CallLoweringInfo &CLI) override;
	  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;

	#include "X86GenFastISel.inc"

	private:
	  // The operands are IR values and are taken by pointer, like every other
	  // Value parameter in this class.
	  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
	                          const DebugLoc &DL);

	  bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
	                       unsigned &ResultReg, unsigned Alignment = 1);

	  bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
	                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
	  bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
	                        X86AddressMode &AM,
	                        MachineMemOperand *MMO = nullptr, bool Aligned = false);

	  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
	                         unsigned &ResultReg);

	  bool X86SelectAddress(const Value *V, X86AddressMode &AM);
	  bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);

	  bool X86SelectLoad(const Instruction *I);

	  bool X86SelectStore(const Instruction *I);

	  bool X86SelectRet(const Instruction *I);

	  bool X86SelectCmp(const Instruction *I);

	  bool X86SelectZExt(const Instruction *I);

	  bool X86SelectSExt(const Instruction *I);

	  bool X86SelectBranch(const Instruction *I);

	  bool X86SelectShift(const Instruction *I);

	  bool X86SelectDivRem(const Instruction *I);

	  bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);

	  bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);

	  bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);

	  bool X86SelectSelect(const Instruction *I);

	  bool X86SelectTrunc(const Instruction *I);

	  bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
	                               const TargetRegisterClass *RC);

	  bool X86SelectFPExt(const Instruction *I);
	  bool X86SelectFPTrunc(const Instruction *I);
	  bool X86SelectSIToFP(const Instruction *I);

	  /// Instruction info of the cached subtarget.
	  const X86InstrInfo *getInstrInfo() const {
	    return Subtarget->getInstrInfo();
	  }
	  /// The inherited TM member, viewed as the X86 target machine.
	  const X86TargetMachine *getTargetMachine() const {
	    return static_cast<const X86TargetMachine *>(&TM);
	  }

	  bool handleConstantAddresses(const Value *V, X86AddressMode &AM);

	  unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
	  unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
	  unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
	  unsigned fastMaterializeConstant(const Constant *C) override;

	  unsigned fastMaterializeAlloca(const AllocaInst *C) override;

	  unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;

	  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
	  /// computed in an SSE register, not on the X87 floating point stack.
	  bool isScalarFPTypeInSSEReg(EVT VT) const {
	    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
	           (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
	  }

	  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);

	  bool IsMemcpySmall(uint64_t Len);

	  bool TryEmitSmallMemcpy(X86AddressMode DestAM,
	                          X86AddressMode SrcAM, uint64_t Len);

	  bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
	                            const Value *Cond);

	  const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
	                                            X86AddressMode &AM);

	  unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
	                             const TargetRegisterClass *RC, unsigned Op0,
	                             bool Op0IsKill, unsigned Op1, bool Op1IsKill,
	                             unsigned Op2, bool Op2IsKill, unsigned Op3,
	                             bool Op3IsKill);
	};

	} // end anonymous namespace.

	/// Map an IR floating-point comparison predicate to the numeric condition
	/// code listed in the table below.
	///
	/// \returns a (condition code, NeedSwap) pair. NeedSwap is set for the
	/// predicates that are expressed through their mirrored form (OGT via LT,
	/// OGE via LE, ULE via NLT, ULT via NLE); the caller presumably has to swap
	/// the compare operands in that case. Any predicate without a case below
	/// reaches llvm_unreachable.
	static std::pair<unsigned, bool>
	getX86SSEConditionCode(CmpInst::Predicate Predicate) {
	unsigned CC;
	bool NeedSwap = false;

	// SSE Condition code mapping:
	// 0 - EQ
	// 1 - LT
	// 2 - LE
	// 3 - UNORD
	// 4 - NEQ
	// 5 - NLT
	// 6 - NLE
	// 7 - ORD
	//
	// The switch also returns 8 (for FCMP_UEQ) and 12 (for FCMP_ONE), which lie
	// outside the 0-7 table above. NOTE(review): these look like the extended
	// AVX compare predicates EQ_UQ / NEQ_OQ - confirm that callers reject codes
	// above 7 when AVX is unavailable.
	switch (Predicate) {
	default: llvm_unreachable("Unexpected predicate");
	case CmpInst::FCMP_OEQ: CC = 0; break;
	case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH;
	case CmpInst::FCMP_OLT: CC = 1; break;
	case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH;
	case CmpInst::FCMP_OLE: CC = 2; break;
	case CmpInst::FCMP_UNO: CC = 3; break;
	case CmpInst::FCMP_UNE: CC = 4; break;
	case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH;
	case CmpInst::FCMP_UGE: CC = 5; break;
	case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH;
	case CmpInst::FCMP_UGT: CC = 6; break;
	case CmpInst::FCMP_ORD: CC = 7; break;
	case CmpInst::FCMP_UEQ: CC = 8; break;
	case CmpInst::FCMP_ONE: CC = 12; break;
	}

	return std::make_pair(CC, NeedSwap);
	}

	/// \brief Append the operands of the addressing mode \p AM to the instruction
	/// being built by \p MIB.
	///
	/// The index register is constrained first. When it cannot be constrained in
	/// place a fresh register is created, and AM.IndexReg is updated to refer to
	/// it.
	const MachineInstrBuilder &
	X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
	                            X86AddressMode &AM) {
	  // The index register lands X86::AddrIndexReg slots past the operands that
	  // are already on the instruction. It needs to be a GR64_NOSP.
	  const unsigned IndexOpNo = MIB->getNumOperands() + X86::AddrIndexReg;
	  AM.IndexReg =
	      constrainOperandRegClass(MIB->getDesc(), AM.IndexReg, IndexOpNo);

	  return ::addFullAddress(MIB, AM);
	}

	/// \brief Try to fold the overflow condition of an arithmetic-with-overflow
	/// intrinsic into its user \p I.
	///
	/// \p Cond must be an extractvalue of such an intrinsic located in the same
	/// basic block as \p I, with nothing but extractvalues of that intrinsic in
	/// between. \p CC is written only when the fold is possible, i.e. when true
	/// is returned.
	bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
	                                       const Value *Cond) {
	  // The condition has to be extracted from the result of an intrinsic call.
	  const auto *EV = dyn_cast<ExtractValueInst>(Cond);
	  if (!EV)
	    return false;

	  const auto *II = dyn_cast<IntrinsicInst>(EV->getAggregateOperand());
	  if (!II)
	    return false;

	  // Only legal i32/i64 arithmetic results are supported.
	  const Function *Callee = II->getCalledFunction();
	  Type *ResultTy =
	      cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
	  MVT ResultVT;
	  if (!isTypeLegal(ResultTy, ResultVT))
	    return false;
	  if (ResultVT != MVT::i32 && ResultVT != MVT::i64)
	    return false;

	  // Pick the flag that carries the overflow bit for this intrinsic.
	  X86::CondCode FoldedCC;
	  switch (II->getIntrinsicID()) {
	  case Intrinsic::sadd_with_overflow:
	  case Intrinsic::ssub_with_overflow:
	  case Intrinsic::smul_with_overflow:
	  case Intrinsic::umul_with_overflow:
	    FoldedCC = X86::COND_O;
	    break;
	  case Intrinsic::uadd_with_overflow:
	  case Intrinsic::usub_with_overflow:
	    FoldedCC = X86::COND_B;
	    break;
	  default:
	    return false;
	  }

	  // The intrinsic and its user must live in the same basic block.
	  if (I->getParent() != II->getParent())
	    return false;

	  // Walk backwards from the instruction preceding the user up to (but not
	  // including) the intrinsic. Everything in between has to be an
	  // extractvalue whose aggregate operand is the intrinsic itself.
	  BasicBlock::const_iterator UserIt(I);
	  BasicBlock::const_iterator IntrinsicIt(II);
	  auto It = std::prev(UserIt);
	  while (It != IntrinsicIt) {
	    const auto *EVI = dyn_cast<ExtractValueInst>(&*It);
	    if (!EVI || EVI->getAggregateOperand() != II)
	      return false;
	    --It;
	  }

	  CC = FoldedCC;
	  return true;
	}

	/// Decide whether fast instruction selection can handle values of type
	/// \p Ty.
	///
	/// \param Ty      The IR type to classify.
	/// \param VT      Receives the simple value type of \p Ty once the type is
	///                known to be simple (this happens before the remaining
	///                checks, so it may be written even when false is returned).
	/// \param AllowI1 When true, i1 is accepted without asking the target
	///                lowering whether it is legal.
	/// \returns true if the type can be handled.
	bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
	  EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
	  if (evt == MVT::Other || !evt.isSimple())
	    // Unhandled type. Halt "fast" selection and bail.
	    return false;

	  VT = evt.getSimpleVT();
	  // For now, require SSE/SSE2 for performing floating-point operations,
	  // since x87 requires additional work.
	  if (VT == MVT::f64 && !X86ScalarSSEf64)
	    return false;
	  if (VT == MVT::f32 && !X86ScalarSSEf32)
	    return false;
	  // Similarly, no f80 support yet.
	  if (VT == MVT::f80)
	    return false;
	  // We only handle legal types. For example, on x86-32 the instruction
	  // selector contains all of the 64-bit instructions from x86-64,
	  // under the assumption that i64 won't be used if the target doesn't
	  // support it.
	  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
	}

	#include "X86GenCallingConv.inc"

	/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
	/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
	/// Return true and the result register by reference if it is possible.
	///
	/// \param VT         Type of the loaded value; selects the opcode and the
	///                   register class of the result.
	/// \param AM         Address to load from.
	/// \param MMO        Optional memory operand. When non-null it is attached to
	///                   the emitted load and queried for the non-temporal flag.
	/// \param ResultReg  Receives the freshly created result register; only
	///                   written when a load is emitted.
	/// \param Alignment  Alignment of the access; decides between the aligned,
	///                   unaligned and non-temporal vector load opcodes.
	/// \returns false for unsupported types and for non-temporal 256-bit loads
	///          that cannot use the 256-bit non-temporal opcode; true otherwise.
	bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
	                                  MachineMemOperand *MMO, unsigned &ResultReg,
	                                  unsigned Alignment) {
	  bool HasSSE41 = Subtarget->hasSSE41();
	  bool HasAVX = Subtarget->hasAVX();
	  bool HasAVX2 = Subtarget->hasAVX2();
	  bool HasAVX512 = Subtarget->hasAVX512();
	  bool HasVLX = Subtarget->hasVLX();
	  bool IsNonTemporal = MMO && MMO->isNonTemporal();

	  // Get opcode and regclass of the output for the given load instruction.
	  unsigned Opc = 0;
	  const TargetRegisterClass *RC = nullptr;
	  switch (VT.getSimpleVT().SimpleTy) {
	  default: return false;
	  case MVT::i1:
	  case MVT::i8:
	    Opc = X86::MOV8rm;
	    RC = &X86::GR8RegClass;
	    break;
	  case MVT::i16:
	    Opc = X86::MOV16rm;
	    RC = &X86::GR16RegClass;
	    break;
	  case MVT::i32:
	    Opc = X86::MOV32rm;
	    RC = &X86::GR32RegClass;
	    break;
	  case MVT::i64:
	    // Must be in x86-64 mode.
	    Opc = X86::MOV64rm;
	    RC = &X86::GR64RegClass;
	    break;
	  case MVT::f32:
	    if (X86ScalarSSEf32) {
	      Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
	      RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass;
	    } else {
	      Opc = X86::LD_Fp32m;
	      RC = &X86::RFP32RegClass;
	    }
	    break;
	  case MVT::f64:
	    if (X86ScalarSSEf64) {
	      Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
	      RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass;
	    } else {
	      Opc = X86::LD_Fp64m;
	      RC = &X86::RFP64RegClass;
	    }
	    break;
	  case MVT::f80:
	    // No f80 support yet.
	    return false;
	  case MVT::v4f32:
	    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
	      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
	            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
	    else if (Alignment >= 16)
	      Opc = HasVLX ? X86::VMOVAPSZ128rm :
	            HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
	    else
	      Opc = HasVLX ? X86::VMOVUPSZ128rm :
	            HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
	    RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
	    break;
	  case MVT::v2f64:
	    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
	      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
	            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
	    else if (Alignment >= 16)
	      Opc = HasVLX ? X86::VMOVAPDZ128rm :
	            HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
	    else
	      Opc = HasVLX ? X86::VMOVUPDZ128rm :
	            HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
	    RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
	    break;
	  case MVT::v4i32:
	  case MVT::v2i64:
	  case MVT::v8i16:
	  case MVT::v16i8:
	    // MOVNTDQA is an SSE4.1 instruction, so it is only usable when SSE4.1 is
	    // available, exactly as in the v4f32 and v2f64 cases above. Without it,
	    // fall back to the regular aligned load.
	    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
	      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
	            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
	    else if (Alignment >= 16)
	      Opc = HasVLX ? X86::VMOVDQA64Z128rm :
	            HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
	    else
	      Opc = HasVLX ? X86::VMOVDQU64Z128rm :
	            HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
	    RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
	    break;
	  case MVT::v8f32:
	    assert(HasAVX);
	    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
	      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
	    else if (IsNonTemporal && Alignment >= 16)
	      return false; // Force split for X86::VMOVNTDQArm
	    else if (Alignment >= 32)
	      Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
	    else
	      Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
	    RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
	    break;
	  case MVT::v4f64:
	    assert(HasAVX);
	    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
	      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
	    else if (IsNonTemporal && Alignment >= 16)
	      return false; // Force split for X86::VMOVNTDQArm
	    else if (Alignment >= 32)
	      Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
	    else
	      Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
	    RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
	    break;
	  case MVT::v8i32:
	  case MVT::v4i64:
	  case MVT::v16i16:
	  case MVT::v32i8:
	    assert(HasAVX);
	    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
	      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
	    else if (IsNonTemporal && Alignment >= 16)
	      return false; // Force split for X86::VMOVNTDQArm
	    else if (Alignment >= 32)
	      Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
	    else
	      Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
	    RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
	    break;
	  case MVT::v16f32:
	    assert(HasAVX512);
	    if (IsNonTemporal && Alignment >= 64)
	      Opc = X86::VMOVNTDQAZrm;
	    else
	      Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
	    RC = &X86::VR512RegClass;
	    break;
	  case MVT::v8f64:
	    assert(HasAVX512);
	    if (IsNonTemporal && Alignment >= 64)
	      Opc = X86::VMOVNTDQAZrm;
	    else
	      Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
	    RC = &X86::VR512RegClass;
	    break;
	  case MVT::v8i64:
	  case MVT::v16i32:
	  case MVT::v32i16:
	  case MVT::v64i8:
	    assert(HasAVX512);
	    // Note: There are a lot more choices based on type with AVX-512, but
	    // there's really no advantage when the load isn't masked.
	    if (IsNonTemporal && Alignment >= 64)
	      Opc = X86::VMOVNTDQAZrm;
	    else
	      Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
	    RC = &X86::VR512RegClass;
	    break;
	  }

	  // Emit the load into a new register of the chosen class.
	  ResultReg = createResultReg(RC);
	  MachineInstrBuilder MIB =
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
	  addFullAddress(MIB, AM);
	  if (MMO)
	    MIB->addMemOperand(*FuncInfo.MF, MMO);
	  return true;
	}

	/// X86FastEmitStore - Emit a machine instruction to store a value Val of
	/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
	/// and a displacement offset, or a GlobalAddress,
	/// i.e. V. Return true if it is possible.
	///
	/// \param VT         Type of the stored value; selects the store opcode.
	/// \param ValReg     Register holding the value to store.
	/// \param ValIsKill  Whether this store is the last use of \p ValReg.
	/// \param AM         Address to store to.
	/// \param MMO        Optional memory operand. When non-null it is attached to
	///                   the emitted store and queried for the non-temporal flag.
	/// \param Aligned    Selects the aligned (and, if requested, non-temporal)
	///                   vector store opcodes instead of the unaligned ones.
	/// \returns false for unsupported types, true once the store is emitted.
	bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
	                                   X86AddressMode &AM,
	                                   MachineMemOperand *MMO, bool Aligned) {
	  bool HasSSE1 = Subtarget->hasSSE1();
	  bool HasSSE2 = Subtarget->hasSSE2();
	  bool HasSSE4A = Subtarget->hasSSE4A();
	  bool HasAVX = Subtarget->hasAVX();
	  bool HasAVX512 = Subtarget->hasAVX512();
	  bool HasVLX = Subtarget->hasVLX();
	  bool IsNonTemporal = MMO && MMO->isNonTemporal();

	  // Get opcode and regclass of the output for the given store instruction.
	  unsigned Opc = 0;
	  switch (VT.getSimpleVT().SimpleTy) {
	  case MVT::f80: // No f80 support yet.
	  default: return false;
	  case MVT::i1: {
	    // Mask out all but lowest bit.
	    unsigned AndResult = createResultReg(&X86::GR8RegClass);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(X86::AND8ri), AndResult)
	      .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
	    ValReg = AndResult;
	    LLVM_FALLTHROUGH; // handle i1 as i8.
	  }
	  case MVT::i8:  Opc = X86::MOV8mr;  break;
	  case MVT::i16: Opc = X86::MOV16mr; break;
	  case MVT::i32:
	    Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
	    break;
	  case MVT::i64:
	    // Must be in x86-64 mode.
	    Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
	    break;
	  case MVT::f32:
	    if (X86ScalarSSEf32) {
	      if (IsNonTemporal && HasSSE4A)
	        Opc = X86::MOVNTSS;
	      else
	        Opc = HasAVX512 ? X86::VMOVSSZmr :
	              HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
	    } else
	      Opc = X86::ST_Fp32m;
	    break;
	  case MVT::f64:
	    // The f64 path must be keyed on the f64 flag: a target with scalar SSE
	    // for f32 only has to take the x87 store below, not an SSE2 opcode.
	    if (X86ScalarSSEf64) {
	      if (IsNonTemporal && HasSSE4A)
	        Opc = X86::MOVNTSD;
	      else
	        Opc = HasAVX512 ? X86::VMOVSDZmr :
	              HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
	    } else
	      Opc = X86::ST_Fp64m;
	    break;
	  case MVT::x86mmx:
	    Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
	    break;
	  case MVT::v4f32:
	    if (Aligned) {
	      if (IsNonTemporal)
	        Opc = HasVLX ? X86::VMOVNTPSZ128mr :
	              HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
	      else
	        Opc = HasVLX ? X86::VMOVAPSZ128mr :
	              HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
	    } else
	      Opc = HasVLX ? X86::VMOVUPSZ128mr :
	            HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
	    break;
	  case MVT::v2f64:
	    if (Aligned) {
	      if (IsNonTemporal)
	        Opc = HasVLX ? X86::VMOVNTPDZ128mr :
	              HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
	      else
	        Opc = HasVLX ? X86::VMOVAPDZ128mr :
	              HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
	    } else
	      Opc = HasVLX ? X86::VMOVUPDZ128mr :
	            HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
	    break;
	  case MVT::v4i32:
	  case MVT::v2i64:
	  case MVT::v8i16:
	  case MVT::v16i8:
	    if (Aligned) {
	      if (IsNonTemporal)
	        Opc = HasVLX ? X86::VMOVNTDQZ128mr :
	              HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
	      else
	        Opc = HasVLX ? X86::VMOVDQA64Z128mr :
	              HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
	    } else
	      Opc = HasVLX ? X86::VMOVDQU64Z128mr :
	            HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
	    break;
	  case MVT::v8f32:
	    assert(HasAVX);
	    if (Aligned) {
	      if (IsNonTemporal)
	        Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
	      else
	        Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
	    } else
	      Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
	    break;
	  case MVT::v4f64:
	    assert(HasAVX);
	    if (Aligned) {
	      if (IsNonTemporal)
	        Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
	      else
	        Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
	    } else
	      Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
	    break;
	  case MVT::v8i32:
	  case MVT::v4i64:
	  case MVT::v16i16:
	  case MVT::v32i8:
	    assert(HasAVX);
	    if (Aligned) {
	      if (IsNonTemporal)
	        Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
	      else
	        Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
	    } else
	      Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
	    break;
	  case MVT::v16f32:
	    assert(HasAVX512);
	    if (Aligned)
	      Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
	    else
	      Opc = X86::VMOVUPSZmr;
	    break;
	  case MVT::v8f64:
	    assert(HasAVX512);
	    if (Aligned) {
	      Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
	    } else
	      Opc = X86::VMOVUPDZmr;
	    break;
	  case MVT::v8i64:
	  case MVT::v16i32:
	  case MVT::v32i16:
	  case MVT::v64i8:
	    assert(HasAVX512);
	    // Note: There are a lot more choices based on type with AVX-512, but
	    // there's really no advantage when the store isn't masked.
	    if (Aligned)
	      Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
	    else
	      Opc = X86::VMOVDQU64Zmr;
	    break;
	  }

	  const MCInstrDesc &Desc = TII.get(Opc);
	  // Some of the instructions in the previous switch use FR128 instead
	  // of FR32 for ValReg. Make sure the register we feed the instruction
	  // matches its register class constraints.
	  // Note: This is fine to do a copy from FR32 to FR128, this is the
	  // same registers behind the scene and actually why it did not trigger
	  // any bugs before.
	  ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
	  MachineInstrBuilder MIB =
	      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
	  addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
	  if (MMO)
	    MIB->addMemOperand(*FuncInfo.MF, MMO);

	  return true;
	}

	/// Store the IR value \p Val of type \p VT to the address \p AM.
	///
	/// Integer constants of type i1/i8/i16/i32, and i64 constants that fit a
	/// sign-extended 32-bit immediate, are written with a store-immediate
	/// instruction. Every other value is materialized in a register and handed
	/// to the register form of X86FastEmitStore.
	///
	/// \returns true if a store was emitted.
	bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
	                                   X86AddressMode &AM,
	                                   MachineMemOperand *MMO, bool Aligned) {
	  // A null pointer is stored as an integer zero of pointer width.
	  if (isa<ConstantPointerNull>(Val))
	    Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));

	  // Try to fold a simple constant directly into the store.
	  if (const auto *Imm = dyn_cast<ConstantInt>(Val)) {
	    unsigned ImmOpc = 0;
	    bool SignExtendImm = true;

	    switch (VT.getSimpleVT().SimpleTy) {
	    case MVT::i1:
	      // An i1 is stored like an i8, but its immediate is zero-extended.
	      SignExtendImm = false;
	      LLVM_FALLTHROUGH;
	    case MVT::i8:
	      ImmOpc = X86::MOV8mi;
	      break;
	    case MVT::i16:
	      ImmOpc = X86::MOV16mi;
	      break;
	    case MVT::i32:
	      ImmOpc = X86::MOV32mi;
	      break;
	    case MVT::i64:
	      // Only usable when the value is a 32-bit sign extended immediate.
	      if (isInt<32>(Imm->getSExtValue()))
	        ImmOpc = X86::MOV64mi32;
	      break;
	    default:
	      break;
	    }

	    if (ImmOpc != 0) {
	      MachineInstrBuilder StoreMI =
	          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ImmOpc));
	      const uint64_t ImmBits = SignExtendImm
	                                   ? (uint64_t)Imm->getSExtValue()
	                                   : Imm->getZExtValue();
	      addFullAddress(StoreMI, AM).addImm(ImmBits);
	      if (MMO)
	        StoreMI->addMemOperand(*FuncInfo.MF, MMO);
	      return true;
	    }
	  }

	  // No immediate form applies: go through a register.
	  const unsigned SrcReg = getRegForValue(Val);
	  if (SrcReg == 0)
	    return false;

	  return X86FastEmitStore(VT, SrcReg, hasTrivialKill(Val), AM, MMO, Aligned);
	}

	/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
	/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
	/// ISD::SIGN_EXTEND).
	///
	/// \param ResultReg Receives the register holding the extended value; left
	///                  untouched when the extension cannot be emitted.
	/// \returns true if the extension was emitted.
	bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
	                                    unsigned Src, EVT SrcVT,
	                                    unsigned &ResultReg) {
	  unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
	                           Src, /*TODO: Kill=*/false);
	  // A zero register means nothing was emitted.
	  if (RR == 0)
	    return false;

	  ResultReg = RR;
	  return true;
	}

	/// Fold the value \p V into the addressing mode \p AM without looking through
	/// any instructions.
	///
	/// A GlobalValue is referenced directly, or through a load from its stub when
	/// the subtarget classifies it as a stub reference. Any other value (and a
	/// global that cannot be combined with an already populated RIP-relative
	/// address) is materialized in a register that becomes the base register or,
	/// if the base is taken, the index register.
	///
	/// \returns true if \p AM now describes the address.
	bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
	  // Handle constant address.
	  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
	    // Can't handle alternate code models yet.
	    if (TM.getCodeModel() != CodeModel::Small)
	      return false;

	    // Can't handle TLS yet.
	    if (GV->isThreadLocal())
	      return false;

	    // RIP-relative addresses can't have additional register operands, so if
	    // we've already folded stuff into the addressing mode, just force the
	    // global value into its own register, which we can use as the basereg.
	    if (!Subtarget->isPICStyleRIPRel() ||
	        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
	      // Okay, we've committed to selecting this global. Set up the address.
	      AM.GV = GV;

	      // Allow the subtarget to classify the global.
	      unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);

	      // If this reference is relative to the pic base, set it now.
	      if (isGlobalRelativeToPICBase(GVFlags)) {
	        // FIXME: How do we know Base.Reg is free??
	        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
	      }

	      // Unless the ABI requires an extra load, return a direct reference to
	      // the global.
	      if (!isGlobalStubReference(GVFlags)) {
	        if (Subtarget->isPICStyleRIPRel()) {
	          // Use rip-relative addressing if we can. Above we verified that the
	          // base and index registers are unused.
	          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
	          AM.Base.Reg = X86::RIP;
	        }
	        AM.GVOpFlags = GVFlags;
	        return true;
	      }

	      // Ok, we need to do a load from a stub. If we've already loaded from
	      // this stub, reuse the loaded pointer, otherwise emit the load now.
	      DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
	      unsigned LoadReg;
	      if (I != LocalValueMap.end() && I->second != 0) {
	        LoadReg = I->second;
	      } else {
	        // Issue load from stub.
	        unsigned Opc = 0;
	        const TargetRegisterClass *RC = nullptr;
	        X86AddressMode StubAM;
	        StubAM.Base.Reg = AM.Base.Reg;
	        StubAM.GV = GV;
	        StubAM.GVOpFlags = GVFlags;

	        // Prepare for inserting code in the local-value area.
	        SavePoint SaveInsertPt = enterLocalValueArea();

	        if (TLI.getPointerTy(DL) == MVT::i64) {
	          Opc = X86::MOV64rm;
	          RC = &X86::GR64RegClass;

	          if (Subtarget->isPICStyleRIPRel())
	            StubAM.Base.Reg = X86::RIP;
	        } else {
	          Opc = X86::MOV32rm;
	          RC = &X86::GR32RegClass;
	        }

	        LoadReg = createResultReg(RC);
	        MachineInstrBuilder LoadMI =
	          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
	        addFullAddress(LoadMI, StubAM);

	        // Ok, back to normal mode.
	        leaveLocalValueArea(SaveInsertPt);

	        // Prevent loading GV stub multiple times in same MBB.
	        LocalValueMap[V] = LoadReg;
	      }

	      // Now construct the final address. Note that the Disp, Scale,
	      // and Index values may already be set here.
	      AM.Base.Reg = LoadReg;
	      AM.GV = nullptr;
	      return true;
	    }
	  }

	  // If all else fails, try to materialize the value in a register.
	  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
	    if (AM.Base.Reg == 0) {
	      AM.Base.Reg = getRegForValue(V);
	      return AM.Base.Reg != 0;
	    }
	    if (AM.IndexReg == 0) {
	      assert(AM.Scale == 1 && "Scale with no index!");
	      AM.IndexReg = getRegForValue(V);
	      return AM.IndexReg != 0;
	    }
	  }

	  return false;
	}

	/// X86SelectAddress - Attempt to fill in an address from the given value.
	///
	bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
	SmallVector<const Value *, 32> GEPs;
	redo_gep:
	const User *U = nullptr;
	unsigned Opcode = Instruction::UserOp1;
	if (const Instruction *I = dyn_cast<Instruction>(V)) {
	// Don't walk into other basic blocks; it's possible we haven't
	// visited them yet, so the instructions may not yet be assigned
	// virtual registers.
	if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) \|\|
	FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
	Opcode = I->getOpcode();
	U = I;
	}
	} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
	Opcode = C->getOpcode();
	U = C;
	}

	if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
	if (Ty->getAddressSpace() > 255)
	// Fast instruction selection doesn't support the special
	// address spaces.
	return false;

	switch (Opcode) {
	default: break;
	case Instruction::BitCast:
	// Look past bitcasts.
	return X86SelectAddress(U->getOperand(0), AM);

	case Instruction::IntToPtr:
	// Look past no-op inttoptrs.
	if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
	TLI.getPointerTy(DL))
	return X86SelectAddress(U->getOperand(0), AM);
	break;

	case Instruction::PtrToInt:
	// Look past no-op ptrtoints.
	if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
	return X86SelectAddress(U->getOperand(0), AM);
	break;

	case Instruction::Alloca: {
	// Do static allocas.
	const AllocaInst *A = cast<AllocaInst>(V);
	DenseMap<const AllocaInst *, int>::iterator SI =
	FuncInfo.StaticAllocaMap.find(A);
	if (SI != FuncInfo.StaticAllocaMap.end()) {
	AM.BaseType = X86AddressMode::FrameIndexBase;
	AM.Base.FrameIndex = SI->second;
	return true;
	}
	break;
	}

	case Instruction::Add: {
	// Adds of constants are common and easy enough.
	if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
	uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
	// They have to fit in the 32-bit signed displacement field though.
	if (isInt<32>(Disp)) {
	AM.Disp = (uint32_t)Disp;
	return X86SelectAddress(U->getOperand(0), AM);
	}
	}
	break;
	}

	case Instruction::GetElementPtr: {
	X86AddressMode SavedAM = AM;

	// Pattern-match simple GEPs.
	uint64_t Disp = (int32_t)AM.Disp;
	unsigned IndexReg = AM.IndexReg;
	unsigned Scale = AM.Scale;
	gep_type_iterator GTI = gep_type_begin(U);
	// Iterate through the indices, folding what we can. Constants can be
	// folded, and one dynamic index can be handled, if the scale is supported.
	for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
	i != e; ++i, ++GTI) {
	const Value Op = i;
	if (StructType *STy = GTI.getStructTypeOrNull()) {
	const StructLayout *SL = DL.getStructLayout(STy);
	Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
	continue;
	}

	// A array/variable index is always of the form i*S where S is the
	// constant scale size. See if we can push the scale into immediates.
	uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
	for (;;) {
	if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
	// Constant-offset addressing.
	Disp += CI->getSExtValue() * S;
	break;
	}
	if (canFoldAddIntoGEP(U, Op)) {
	// A compatible add with a constant operand. Fold the constant.
	ConstantInt *CI =
	cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
	Disp += CI->getSExtValue() * S;
	// Iterate on the other operand.
	Op = cast<AddOperator>(Op)->getOperand(0);
	continue;
	}
	if (IndexReg == 0 &&
	(!AM.GV \|\| !Subtarget->isPICStyleRIPRel()) &&
	(S == 1 \|\| S == 2 \|\| S == 4 \|\| S == 8)) {
	// Scaled-index addressing.
	Scale = S;
	IndexReg = getRegForGEPIndex(Op).first;
	if (IndexReg == 0)
	return false;
	break;
	}
	// Unsupported.
	goto unsupported_gep;
	}
	}

	// Check for displacement overflow.
	if (!isInt<32>(Disp))
	break;

	AM.IndexReg = IndexReg;
	AM.Scale = Scale;
	AM.Disp = (uint32_t)Disp;
	GEPs.push_back(V);

	if (const GetElementPtrInst *GEP =
	dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
	// Ok, the GEP indices were covered by constant-offset and scaled-index
	// addressing. Update the address state and move on to examining the base.
	V = GEP;
	goto redo_gep;
	} else if (X86SelectAddress(U->getOperand(0), AM)) {
	return true;
	}

	// If we couldn't merge the gep value into this addr mode, revert back to
	// our address and just match the value instead of completely failing.
	AM = SavedAM;

	for (const Value *I : reverse(GEPs))
	if (handleConstantAddresses(I, AM))
	return true;

	return false;
	unsupported_gep:
	// Ok, the GEP indices weren't all covered.
	break;
	}
	}

	return handleConstantAddresses(V, AM);
	}

	/// X86SelectCallAddress - Attempt to fill in an address from the given value.
	///
	bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
	const User *U = nullptr;
	unsigned Opcode = Instruction::UserOp1;
	const Instruction *I = dyn_cast<Instruction>(V);
	// Record if the value is defined in the same basic block.
	//
	// This information is crucial to know whether or not folding an
	// operand is valid.
	// Indeed, FastISel generates or reuses a virtual register for all
	// operands of all instructions it selects. Obviously, the definition and
	// its uses must use the same virtual register otherwise the produced
	// code is incorrect.
	// Before instruction selection, FunctionLoweringInfo::set sets the virtual
	// registers for values that are alive across basic blocks. This ensures
	// that the values are consistently set between across basic block, even
	// if different instruction selection mechanisms are used (e.g., a mix of
	// SDISel and FastISel).
	// For values local to a basic block, the instruction selection process
	// generates these virtual registers with whatever method is appropriate
	// for its needs. In particular, FastISel and SDISel do not share the way
	// local virtual registers are set.
	// Therefore, this is impossible (or at least unsafe) to share values
	// between basic blocks unless they use the same instruction selection
	// method, which is not guarantee for X86.
	// Moreover, things like hasOneUse could not be used accurately, if we
	// allow to reference values across basic blocks whereas they are not
	// alive across basic blocks initially.
	bool InMBB = true;
	if (I) {
	Opcode = I->getOpcode();
	U = I;
	InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
	} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
	Opcode = C->getOpcode();
	U = C;
	}

	switch (Opcode) {
	default: break;
	case Instruction::BitCast:
	// Look past bitcasts if its operand is in the same BB.
	if (InMBB)
	return X86SelectCallAddress(U->getOperand(0), AM);
	break;

	case Instruction::IntToPtr:
	// Look past no-op inttoptrs if its operand is in the same BB.
	if (InMBB &&
	TLI.getValueType(DL, U->getOperand(0)->getType()) ==
	TLI.getPointerTy(DL))
	return X86SelectCallAddress(U->getOperand(0), AM);
	break;

	case Instruction::PtrToInt:
	// Look past no-op ptrtoints if its operand is in the same BB.
	if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
	return X86SelectCallAddress(U->getOperand(0), AM);
	break;
	}

	// Handle constant address.
	if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
	// Can't handle alternate code models yet.
	if (TM.getCodeModel() != CodeModel::Small)
	return false;

	// RIP-relative addresses can't have additional register operands.
	if (Subtarget->isPICStyleRIPRel() &&
	(AM.Base.Reg != 0 \|\| AM.IndexReg != 0))
	return false;

	// Can't handle TLS.
	if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
	if (GVar->isThreadLocal())
	return false;

	// Okay, we've committed to selecting this global. Set up the basic address.
	AM.GV = GV;

	// Return a direct reference to the global. Fastisel can handle calls to
	// functions that require loads, such as dllimport and nonlazybind
	// functions.
	if (Subtarget->isPICStyleRIPRel()) {
	// Use rip-relative addressing if we can. Above we verified that the
	// base and index registers are unused.
	assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
	AM.Base.Reg = X86::RIP;
	} else {
	AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
	}

	return true;
	}

	// If all else fails, try to materialize the value in a register.
	if (!AM.GV \|\| !Subtarget->isPICStyleRIPRel()) {
	if (AM.Base.Reg == 0) {
	AM.Base.Reg = getRegForValue(V);
	return AM.Base.Reg != 0;
	}
	if (AM.IndexReg == 0) {
	assert(AM.Scale == 1 && "Scale with no index!");
	AM.IndexReg = getRegForValue(V);
	return AM.IndexReg != 0;
	}
	}

	return false;
	}


	/// X86SelectStore - Select and emit code to implement store instructions.
	///
	/// Atomic stores, stores through a swifterror argument or alloca (when the
	/// target supports swifterror), and stores of types rejected by isTypeLegal()
	/// are not handled here.
	///
	/// \param I The store instruction.
	/// \returns true if the store was emitted.
	bool X86FastISel::X86SelectStore(const Instruction *I) {
	  // Atomic stores need special handling.
	  const StoreInst *S = cast<StoreInst>(I);

	  if (S->isAtomic())
	    return false;

	  const Value *Val = S->getValueOperand();
	  const Value *Ptr = S->getPointerOperand();

	  if (TLI.supportSwiftError()) {
	    // Swifterror values can come from either a function parameter with
	    // swifterror attribute or an alloca with swifterror attribute.
	    if (const Argument *Arg = dyn_cast<Argument>(Ptr)) {
	      if (Arg->hasSwiftErrorAttr())
	        return false;
	    }

	    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(Ptr)) {
	      if (Alloca->isSwiftError())
	        return false;
	    }
	  }

	  MVT VT;
	  if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
	    return false;

	  unsigned Alignment = S->getAlignment();
	  unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = ABIAlignment;
	  // The store counts as aligned when it is at least ABI-aligned.
	  bool Aligned = Alignment >= ABIAlignment;

	  X86AddressMode AM;
	  if (!X86SelectAddress(Ptr, AM))
	    return false;

	  return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
	}

	/// X86SelectRet - Select and emit code to implement ret instructions.
	///
	/// Only a restricted set of returns is handled: a whitelisted calling
	/// convention, no varargs, no swifterror, no split CSR, at most one return
	/// value that lives in a (non-x87) register, and integer extension only from
	/// i1/i8/i16 to i32. Everything else returns false.
	///
	/// \param I The return instruction.
	/// \returns true if the return sequence was emitted.
	bool X86FastISel::X86SelectRet(const Instruction *I) {
	  const ReturnInst *Ret = cast<ReturnInst>(I);
	  const Function &F = *I->getParent()->getParent();
	  const X86MachineFunctionInfo *X86MFInfo =
	      FuncInfo.MF->getInfo<X86MachineFunctionInfo>();

	  if (!FuncInfo.CanLowerReturn)
	    return false;

	  if (TLI.supportSwiftError() &&
	      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
	    return false;

	  if (TLI.supportSplitCSR(FuncInfo.MF))
	    return false;

	  CallingConv::ID CC = F.getCallingConv();
	  if (CC != CallingConv::C &&
	      CC != CallingConv::Fast &&
	      CC != CallingConv::X86_FastCall &&
	      CC != CallingConv::X86_StdCall &&
	      CC != CallingConv::X86_ThisCall &&
	      CC != CallingConv::X86_64_SysV &&
	      CC != CallingConv::Win64)
	    return false;

	  // Don't handle popping bytes if they don't fit the ret's immediate.
	  if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
	    return false;

	  // fastcc with -tailcallopt is intended to provide a guaranteed
	  // tail call optimization. Fastisel doesn't know how to do that.
	  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
	    return false;

	  // Let SDISel handle vararg functions.
	  if (F.isVarArg())
	    return false;

	  // Build a list of return value registers.
	  SmallVector<unsigned, 4> RetRegs;

	  if (Ret->getNumOperands() > 0) {
	    SmallVector<ISD::OutputArg, 4> Outs;
	    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);

	    // Analyze operands of the call, assigning locations to each operand.
	    SmallVector<CCValAssign, 16> ValLocs;
	    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
	    CCInfo.AnalyzeReturn(Outs, RetCC_X86);

	    const Value *RV = Ret->getOperand(0);
	    unsigned Reg = getRegForValue(RV);
	    if (Reg == 0)
	      return false;

	    // Only handle a single return value for now.
	    if (ValLocs.size() != 1)
	      return false;

	    CCValAssign &VA = ValLocs[0];

	    // Don't bother handling odd stuff for now.
	    if (VA.getLocInfo() != CCValAssign::Full)
	      return false;
	    // Only handle register returns for now.
	    if (!VA.isRegLoc())
	      return false;

	    // The calling-convention tables for x87 returns don't tell
	    // the whole story.
	    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
	      return false;

	    unsigned SrcReg = Reg + VA.getValNo();
	    EVT SrcVT = TLI.getValueType(DL, RV->getType());
	    EVT DstVT = VA.getValVT();
	    // Special handling for extended integers.
	    if (SrcVT != DstVT) {
	      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
	        return false;

	      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
	        return false;

	      assert(DstVT == MVT::i32 && "X86 should always ext to i32");

	      if (SrcVT == MVT::i1) {
	        if (Outs[0].Flags.isSExt())
	          return false;
	        SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
	        SrcVT = MVT::i8;
	      }
	      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
	                                             ISD::SIGN_EXTEND;
	      SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
	                          SrcReg, /*TODO: Kill=*/false);
	      // A zero register means the extension could not be emitted; bail out
	      // instead of asking for the register class of register 0 below.
	      if (SrcReg == 0)
	        return false;
	    }

	    // Make the copy.
	    unsigned DstReg = VA.getLocReg();
	    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
	    // Avoid a cross-class copy. This is very unlikely.
	    if (!SrcRC->contains(DstReg))
	      return false;
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);

	    // Add register to return instruction.
	    RetRegs.push_back(VA.getLocReg());
	  }

	  // Swift calling convention does not require we copy the sret argument
	  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

	  // All x86 ABIs require that for returning structs by value we copy
	  // the sret argument into %rax/%eax (depending on ABI) for the return.
	  // We saved the argument into a virtual register in the entry block,
	  // so now we copy the value out and into %rax/%eax.
	  if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
	    unsigned Reg = X86MFInfo->getSRetReturnReg();
	    assert(Reg &&
	           "SRetReturnReg should have been set in LowerFormalArguments()!");
	    unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
	    RetRegs.push_back(RetReg);
	  }

	  // Now emit the RET.
	  MachineInstrBuilder MIB;
	  if (X86MFInfo->getBytesToPopOnReturn()) {
	    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	                  TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL))
	              .addImm(X86MFInfo->getBytesToPopOnReturn());
	  } else {
	    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	                  TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
	  }
	  // The registers carrying return values are implicit uses of the RET.
	  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
	    MIB.addReg(RetRegs[i], RegState::Implicit);
	  return true;
	}

	/// X86SelectLoad - Select and emit code to implement load instructions.
	///
	/// Atomic loads, loads through a swifterror argument or alloca (when the
	/// target supports swifterror), and loads of types rejected by isTypeLegal()
	/// are not handled here.
	///
	/// \param I The load instruction.
	/// \returns true if the load was emitted and its result register recorded.
	bool X86FastISel::X86SelectLoad(const Instruction *I) {
	  const LoadInst *LI = cast<LoadInst>(I);

	  // Atomic loads need special handling.
	  if (LI->isAtomic())
	    return false;

	  const Value *Ptr = LI->getPointerOperand();
	  if (TLI.supportSwiftError()) {
	    // Swifterror values can come from either a function parameter with
	    // swifterror attribute or an alloca with swifterror attribute.
	    if (const Argument *Arg = dyn_cast<Argument>(Ptr)) {
	      if (Arg->hasSwiftErrorAttr())
	        return false;
	    }

	    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(Ptr)) {
	      if (Alloca->isSwiftError())
	        return false;
	    }
	  }

	  MVT VT;
	  if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
	    return false;

	  X86AddressMode AM;
	  if (!X86SelectAddress(Ptr, AM))
	    return false;

	  unsigned Alignment = LI->getAlignment();
	  unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType());
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = ABIAlignment;

	  unsigned ResultReg = 0;
	  if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
	                       Alignment))
	    return false;

	  updateValueMap(I, ResultReg);
	  return true;
	}

	/// Pick the register-register compare opcode for \p VT on \p Subtarget.
	///
	/// Integer types map to the CMPrr opcode of the matching width. f32/f64 map
	/// to the UCOMISS/UCOMISD opcode of the most capable encoding the subtarget
	/// reports (AVX512, then AVX, then plain SSE). Returns 0 when the type is not
	/// handled or the required SSE level is missing.
	static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
	  const bool UseAVX512 = Subtarget->hasAVX512();
	  const bool UseAVX = Subtarget->hasAVX();
	  const bool CanCompareF32 = Subtarget->hasSSE1();
	  const bool CanCompareF64 = Subtarget->hasSSE2();

	  switch (VT.getSimpleVT().SimpleTy) {
	  case MVT::i8:
	    return X86::CMP8rr;
	  case MVT::i16:
	    return X86::CMP16rr;
	  case MVT::i32:
	    return X86::CMP32rr;
	  case MVT::i64:
	    return X86::CMP64rr;
	  case MVT::f32:
	    if (!CanCompareF32)
	      return 0;
	    if (UseAVX512)
	      return X86::VUCOMISSZrr;
	    if (UseAVX)
	      return X86::VUCOMISSrr;
	    return X86::UCOMISSrr;
	  case MVT::f64:
	    if (!CanCompareF64)
	      return 0;
	    if (UseAVX512)
	      return X86::VUCOMISDZrr;
	    if (UseAVX)
	      return X86::VUCOMISDrr;
	    return X86::UCOMISDrr;
	  default:
	    return 0;
	  }
	}

	/// Given a comparison whose right-hand side is the constant \p RHSC, return a
	/// compare-with-immediate opcode for \p VT (e.g. CMP32ri), or 0 if the
	/// immediate cannot be folded into the comparison.
	static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
	  const int64_t Imm = RHSC->getSExtValue();
	  const bool FitsInImm8 = isInt<8>(Imm);

	  switch (VT.getSimpleVT().SimpleTy) {
	  case MVT::i8:
	    return X86::CMP8ri;
	  case MVT::i16:
	    return FitsInImm8 ? X86::CMP16ri8 : X86::CMP16ri;
	  case MVT::i32:
	    return FitsInImm8 ? X86::CMP32ri8 : X86::CMP32ri;
	  case MVT::i64:
	    if (FitsInImm8)
	      return X86::CMP64ri8;
	    // 64-bit comparisons are only valid if the immediate fits in a 32-bit
	    // sext field.
	    if (!isInt<32>(Imm))
	      return 0;
	    return X86::CMP64ri32;
	  default:
	    // Otherwise, we can't fold the immediate into this comparison.
	    return 0;
	  }
	}

	bool X86FastISel::X86FastEmitCompare(const Value Op0, const Value Op1, EVT VT,
	const DebugLoc &CurDbgLoc) {
	unsigned Op0Reg = getRegForValue(Op0);
	if (Op0Reg == 0) return false;

	// Handle 'null' like i32/i64 0.
	if (isa<ConstantPointerNull>(Op1))
	Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));

	// We have two options: compare with register or immediate. If the RHS of
	// the compare is an immediate that we can fold into this compare, use
	// CMPri, otherwise use CMPrr.
	if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
	if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
	.addReg(Op0Reg)
	.addImm(Op1C->getSExtValue());
	return true;
	}
	}

	unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
	if (CompareOpc == 0) return false;

	unsigned Op1Reg = getRegForValue(Op1);
	if (Op1Reg == 0) return false;
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
	.addReg(Op0Reg)
	.addReg(Op1Reg);

	return true;
	}

	bool X86FastISel::X86SelectCmp(const Instruction *I) {
	const CmpInst *CI = cast<CmpInst>(I);

	MVT VT;
	if (!isTypeLegal(I->getOperand(0)->getType(), VT))
	return false;

	// Try to optimize or fold the cmp.
	CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
	unsigned ResultReg = 0;
	switch (Predicate) {
	default: break;
	case CmpInst::FCMP_FALSE: {
	ResultReg = createResultReg(&X86::GR32RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
	ResultReg);
	ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /Kill=/true,
	X86::sub_8bit);
	if (!ResultReg)
	return false;
	break;
	}
	case CmpInst::FCMP_TRUE: {
	ResultReg = createResultReg(&X86::GR8RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
	ResultReg).addImm(1);
	break;
	}
	}

	if (ResultReg) {
	updateValueMap(I, ResultReg);
	return true;
	}

	const Value *LHS = CI->getOperand(0);
	const Value *RHS = CI->getOperand(1);

	// The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
	// We don't have to materialize a zero constant for this case and can just use
	// %x again on the RHS.
	if (Predicate == CmpInst::FCMP_ORD \|\| Predicate == CmpInst::FCMP_UNO) {
	const auto *RHSC = dyn_cast<ConstantFP>(RHS);
	if (RHSC && RHSC->isNullValue())
	RHS = LHS;
	}

	// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
	static const uint16_t SETFOpcTable[2][3] = {
	{ X86::SETEr, X86::SETNPr, X86::AND8rr },
	{ X86::SETNEr, X86::SETPr, X86::OR8rr }
	};
	const uint16_t *SETFOpc = nullptr;
	switch (Predicate) {
	default: break;
	case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
	case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
	}

	ResultReg = createResultReg(&X86::GR8RegClass);
	if (SETFOpc) {
	if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
	return false;

	unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
	unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
	FlagReg1);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
	FlagReg2);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
	ResultReg).addReg(FlagReg1).addReg(FlagReg2);
	updateValueMap(I, ResultReg);
	return true;
	}

	X86::CondCode CC;
	bool SwapArgs;
	std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
	assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
	unsigned Opc = X86::getSETFromCond(CC);

	if (SwapArgs)
	std::swap(LHS, RHS);

	// Emit a compare of LHS/RHS.
	if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
	return false;

	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
	updateValueMap(I, ResultReg);
	return true;
	}

	bool X86FastISel::X86SelectZExt(const Instruction *I) {
	EVT DstVT = TLI.getValueType(DL, I->getType());
	if (!TLI.isTypeLegal(DstVT))
	return false;

	unsigned ResultReg = getRegForValue(I->getOperand(0));
	if (ResultReg == 0)
	return false;

	// Handle zero-extension from i1 to i8, which is common.
	MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
	if (SrcVT == MVT::i1) {
	// Set the high bits to zero.
	ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /TODO: Kill=/false);
	SrcVT = MVT::i8;

	if (ResultReg == 0)
	return false;
	}

	if (DstVT == MVT::i64) {
	// Handle extension to 64-bits via sub-register shenanigans.
	unsigned MovInst;

	switch (SrcVT.SimpleTy) {
	case MVT::i8: MovInst = X86::MOVZX32rr8; break;
	case MVT::i16: MovInst = X86::MOVZX32rr16; break;
	case MVT::i32: MovInst = X86::MOV32rr; break;
	default: llvm_unreachable("Unexpected zext to i64 source type");
	}

	unsigned Result32 = createResultReg(&X86::GR32RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
	.addReg(ResultReg);

	ResultReg = createResultReg(&X86::GR64RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
	ResultReg)
	.addImm(0).addReg(Result32).addImm(X86::sub_32bit);
	} else if (DstVT == MVT::i16) {
	// i8->i16 doesn't exist in the autogenerated isel table. Need to zero
	// extend to 32-bits and then extract down to 16-bits.
	unsigned Result32 = createResultReg(&X86::GR32RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
	Result32).addReg(ResultReg);

	ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /Kill=/true,
	X86::sub_16bit);
	} else if (DstVT != MVT::i8) {
	ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
	ResultReg, /Kill=/true);
	if (ResultReg == 0)
	return false;
	}

	updateValueMap(I, ResultReg);
	return true;
	}

	/// Select a sign-extension.
	///
	/// An i1 source is turned into a sign-extended i8 by zero-extending it and
	/// negating the result. Extension to i16 goes through MOVSX32rr8 plus a
	/// 16-bit subregister extract; any other non-i8 destination goes through
	/// fastEmit_r. Returns false if the destination type is not legal or a
	/// required register/instruction cannot be produced.
	bool X86FastISel::X86SelectSExt(const Instruction *I) {
	  EVT DstVT = TLI.getValueType(DL, I->getType());
	  if (!TLI.isTypeLegal(DstVT))
	    return false;

	  unsigned ResultReg = getRegForValue(I->getOperand(0));
	  if (ResultReg == 0)
	    return false;

	  // Handle sign-extension from i1 to i8.
	  MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
	  if (SrcVT == MVT::i1) {
	    // Set the high bits to zero.
	    unsigned ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
	                                          /*TODO: Kill=*/false);
	    if (ZExtReg == 0)
	      return false;

	    // Negate the result to make an 8-bit sign extended value.
	    ResultReg = createResultReg(&X86::GR8RegClass);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::NEG8r),
	            ResultReg).addReg(ZExtReg);

	    SrcVT = MVT::i8;
	  }

	  if (DstVT == MVT::i16) {
	    // i8->i16 doesn't exist in the autogenerated isel table. Need to sign
	    // extend to 32-bits and then extract down to 16-bits.
	    unsigned Result32 = createResultReg(&X86::GR32RegClass);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
	            Result32).addReg(ResultReg);

	    ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true,
	                                           X86::sub_16bit);
	  } else if (DstVT != MVT::i8) {
	    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
	                           ResultReg, /*Kill=*/true);
	    if (ResultReg == 0)
	      return false;
	  }

	  updateValueMap(I, ResultReg);
	  return true;
	}

	/// Select a conditional branch.
	///
	/// Tries, in order: folding a single-use compare from the same block into the
	/// branch, folding a single-use truncate-to-i1 from the same block into a
	/// TEST + jump, and folding an arithmetic-with-overflow intrinsic via
	/// foldX86XALUIntrinsic(). If none applies, the condition value is tested
	/// against 1 and a JNE to the true block is emitted.
	bool X86FastISel::X86SelectBranch(const Instruction *I) {
	  // Unconditional branches are selected by tablegen-generated code.
	  // Handle a conditional branch.
	  const BranchInst *BI = cast<BranchInst>(I);
	  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
	  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];

	  // Fold the common case of a conditional branch with a comparison
	  // in the same block (values defined on other blocks may not have
	  // initialized registers).
	  X86::CondCode CC;
	  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
	    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
	      EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());

	      // Try to optimize or fold the cmp.
	      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
	      switch (Predicate) {
	      default: break;
	      case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
	      case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, DbgLoc); return true;
	      }

	      const Value *CmpLHS = CI->getOperand(0);
	      const Value *CmpRHS = CI->getOperand(1);

	      // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
	      // 0.0.
	      // We don't have to materialize a zero constant for this case and can just
	      // use %x again on the RHS.
	      if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
	        const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
	        if (CmpRHSC && CmpRHSC->isNullValue())
	          CmpRHS = CmpLHS;
	      }

	      // Try to take advantage of fallthrough opportunities.
	      if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
	        std::swap(TrueMBB, FalseMBB);
	        Predicate = CmpInst::getInversePredicate(Predicate);
	      }

	      // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
	      // code check. Instead two branch instructions are required to check all
	      // the flags. First we change the predicate to a supported condition code,
	      // which will be the first branch. Later on we will emit the second
	      // branch.
	      bool NeedExtraBranch = false;
	      switch (Predicate) {
	      default: break;
	      case CmpInst::FCMP_OEQ:
	        std::swap(TrueMBB, FalseMBB);
	        LLVM_FALLTHROUGH;
	      case CmpInst::FCMP_UNE:
	        NeedExtraBranch = true;
	        Predicate = CmpInst::FCMP_ONE;
	        break;
	      }

	      bool SwapArgs;
	      unsigned BranchOpc;
	      std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
	      assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");

	      BranchOpc = X86::GetCondBranchFromCond(CC);
	      if (SwapArgs)
	        std::swap(CmpLHS, CmpRHS);

	      // Emit a compare of the LHS and RHS, setting the flags.
	      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
	        return false;

	      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
	        .addMBB(TrueMBB);

	      // X86 requires a second branch to handle UNE (and OEQ, which is mapped
	      // to UNE above).
	      if (NeedExtraBranch) {
	        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
	          .addMBB(TrueMBB);
	      }

	      finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
	      return true;
	    }
	  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
	    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
	    // typically happen for _Bool and C++ bools.
	    MVT SourceVT;
	    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
	        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
	      unsigned TestOpc = 0;
	      switch (SourceVT.SimpleTy) {
	      default: break;
	      case MVT::i8:  TestOpc = X86::TEST8ri; break;
	      case MVT::i16: TestOpc = X86::TEST16ri; break;
	      case MVT::i32: TestOpc = X86::TEST32ri; break;
	      case MVT::i64: TestOpc = X86::TEST64ri32; break;
	      }
	      if (TestOpc) {
	        unsigned OpReg = getRegForValue(TI->getOperand(0));
	        if (OpReg == 0) return false;

	        // Only the low bit of the untruncated value decides the branch.
	        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
	          .addReg(OpReg).addImm(1);

	        unsigned JmpOpc = X86::JNE_1;
	        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
	          std::swap(TrueMBB, FalseMBB);
	          JmpOpc = X86::JE_1;
	        }

	        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
	          .addMBB(TrueMBB);

	        finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
	        return true;
	      }
	    }
	  } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
	    // Fake request the condition, otherwise the intrinsic might be completely
	    // optimized away.
	    unsigned TmpReg = getRegForValue(BI->getCondition());
	    if (TmpReg == 0)
	      return false;

	    unsigned BranchOpc = X86::GetCondBranchFromCond(CC);

	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
	      .addMBB(TrueMBB);
	    finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
	    return true;
	  }

	  // Otherwise do a clumsy setcc and re-test it.
	  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
	  // in an explicit cast, so make sure to handle that correctly.
	  unsigned OpReg = getRegForValue(BI->getCondition());
	  if (OpReg == 0) return false;

	  // In case OpReg is a K register, COPY to a GPR
	  if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
	    unsigned KOpReg = OpReg;
	    OpReg = createResultReg(&X86::GR32RegClass);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(TargetOpcode::COPY), OpReg)
	        .addReg(KOpReg);
	    OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true,
	                                       X86::sub_8bit);
	  }
	  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
	      .addReg(OpReg)
	      .addImm(1);
	  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
	    .addMBB(TrueMBB);
	  finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
	  return true;
	}

	bool X86FastISel::X86SelectShift(const Instruction *I) {
	unsigned CReg = 0, OpReg = 0;
	const TargetRegisterClass *RC = nullptr;
	assert(!I->getType()->isIntegerTy(8) &&
	"i8 shifts should be handled by autogenerated table");
	if (I->getType()->isIntegerTy(16)) {
	CReg = X86::CX;
	RC = &X86::GR16RegClass;
	switch (I->getOpcode()) {
	default: llvm_unreachable("Unexpected shift opcode");
	case Instruction::LShr: OpReg = X86::SHR16rCL; break;
	case Instruction::AShr: OpReg = X86::SAR16rCL; break;
	case Instruction::Shl: OpReg = X86::SHL16rCL; break;
	}
	} else if (I->getType()->isIntegerTy(32)) {
	CReg = X86::ECX;
	RC = &X86::GR32RegClass;
	switch (I->getOpcode()) {
	default: llvm_unreachable("Unexpected shift opcode");
	case Instruction::LShr: OpReg = X86::SHR32rCL; break;
	case Instruction::AShr: OpReg = X86::SAR32rCL; break;
	case Instruction::Shl: OpReg = X86::SHL32rCL; break;
	}
	} else if (I->getType()->isIntegerTy(64)) {
	CReg = X86::RCX;
	RC = &X86::GR64RegClass;
	switch (I->getOpcode()) {
	default: llvm_unreachable("Unexpected shift opcode");
	case Instruction::LShr: OpReg = X86::SHR64rCL; break;
	case Instruction::AShr: OpReg = X86::SAR64rCL; break;
	case Instruction::Shl: OpReg = X86::SHL64rCL; break;
	}
	} else {
	return false;
	}

	MVT VT;
	if (!isTypeLegal(I->getType(), VT))
	return false;

	unsigned Op0Reg = getRegForValue(I->getOperand(0));
	if (Op0Reg == 0) return false;

	unsigned Op1Reg = getRegForValue(I->getOperand(1));
	if (Op1Reg == 0) return false;
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
	CReg).addReg(Op1Reg);

	// The shift instruction uses X86::CL. If we defined a super-register
	// of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
	assert(CReg != X86::CL && "CReg should be a super register of CL");
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::KILL), X86::CL)
	.addReg(CReg, RegState::Kill);

	unsigned ResultReg = createResultReg(RC);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
	.addReg(Op0Reg);
	updateValueMap(I, ResultReg);
	return true;
	}

	/// Select SDiv/SRem/UDiv/URem for i8, i16, i32 and (on 64-bit targets) i64.
	///
	/// The per-type, per-operation choices (input registers, DIV/IDIV opcode,
	/// extension opcode, result register) come from the static OpTable below.
	/// Returns false if the type is not legal or not handled, or if an operand
	/// cannot be put in a register.
	bool X86FastISel::X86SelectDivRem(const Instruction *I) {
	  const static unsigned NumTypes = 4; // i8, i16, i32, i64
	  const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
	  const static bool S = true;  // IsSigned
	  const static bool U = false; // !IsSigned
	  const static unsigned Copy = TargetOpcode::COPY;
	  // For the X86 DIV/IDIV instruction, in most cases the dividend
	  // (numerator) must be in a specific register pair highreg:lowreg,
	  // producing the quotient in lowreg and the remainder in highreg.
	  // For most data types, to set up the instruction, the dividend is
	  // copied into lowreg, and lowreg is sign-extended or zero-extended
	  // into highreg.  The exception is i8, where the dividend is defined
	  // as a single register rather than a register pair, and we
	  // therefore directly sign-extend or zero-extend the dividend into
	  // lowreg, instead of copying, and ignore the highreg.
	  const static struct DivRemEntry {
	    // The following portion depends only on the data type.
	    const TargetRegisterClass *RC;
	    unsigned LowInReg;  // low part of the register pair
	    unsigned HighInReg; // high part of the register pair
	    // The following portion depends on both the data type and the operation.
	    struct DivRemResult {
	    unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
	    unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
	                              // highreg, or copying a zero into highreg.
	    unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
	                              // zero/sign-extending into lowreg for i8.
	    unsigned DivRemResultReg; // Register containing the desired result.
	    bool IsOpSigned;          // Whether to use signed or unsigned form.
	    } ResultTable[NumOps];
	  } OpTable[NumTypes] = {
	    { &X86::GR8RegClass,  X86::AX,  0, {
	        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
	        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
	        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
	        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
	      }
	    }, // i8
	    { &X86::GR16RegClass, X86::AX,  X86::DX, {
	        { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
	        { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
	        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
	        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
	      }
	    }, // i16
	    { &X86::GR32RegClass, X86::EAX, X86::EDX, {
	        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
	        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
	        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
	        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
	      }
	    }, // i32
	    { &X86::GR64RegClass, X86::RAX, X86::RDX, {
	        { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
	        { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
	        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
	        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
	      }
	    }, // i64
	  };

	  MVT VT;
	  if (!isTypeLegal(I->getType(), VT))
	    return false;

	  // Row of OpTable (by type) and column of ResultTable (by operation).
	  unsigned TypeIndex, OpIndex;
	  switch (VT.SimpleTy) {
	  default: return false;
	  case MVT::i8:  TypeIndex = 0; break;
	  case MVT::i16: TypeIndex = 1; break;
	  case MVT::i32: TypeIndex = 2; break;
	  case MVT::i64: TypeIndex = 3;
	    if (!Subtarget->is64Bit())
	      return false;
	    break;
	  }

	  switch (I->getOpcode()) {
	  default: llvm_unreachable("Unexpected div/rem opcode");
	  case Instruction::SDiv: OpIndex = 0; break;
	  case Instruction::SRem: OpIndex = 1; break;
	  case Instruction::UDiv: OpIndex = 2; break;
	  case Instruction::URem: OpIndex = 3; break;
	  }

	  const DivRemEntry &TypeEntry = OpTable[TypeIndex];
	  const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
	  unsigned Op0Reg = getRegForValue(I->getOperand(0));
	  if (Op0Reg == 0)
	    return false;
	  unsigned Op1Reg = getRegForValue(I->getOperand(1));
	  if (Op1Reg == 0)
	    return false;

	  // Move op0 into low-order input register.
	  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	          TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
	  // Zero-extend or sign-extend into high-order input register.
	  if (OpEntry.OpSignExtend) {
	    if (OpEntry.IsOpSigned)
	      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	              TII.get(OpEntry.OpSignExtend));
	    else {
	      unsigned Zero32 = createResultReg(&X86::GR32RegClass);
	      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	              TII.get(X86::MOV32r0), Zero32);

	      // Copy the zero into the appropriate sub/super/identical physical
	      // register. Unfortunately the operations needed are not uniform enough
	      // to fit neatly into the table above.
	      if (VT == MVT::i16) {
	        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	                TII.get(Copy), TypeEntry.HighInReg)
	          .addReg(Zero32, 0, X86::sub_16bit);
	      } else if (VT == MVT::i32) {
	        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	                TII.get(Copy), TypeEntry.HighInReg)
	            .addReg(Zero32);
	      } else if (VT == MVT::i64) {
	        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
	            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
	      }
	    }
	  }
	  // Generate the DIV/IDIV instruction.
	  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	          TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
	  // For i8 remainder, we can't reference ah directly, as we'll end
	  // up with bogus copies like %r9b = COPY %ah. Reference ax
	  // instead to prevent ah references in a rex instruction.
	  //
	  // The current assumption of the fast register allocator is that isel
	  // won't generate explicit references to the GR8_NOREX registers. If
	  // the allocator and/or the backend get enhanced to be more robust in
	  // that regard, this can be, and should be, removed.
	  unsigned ResultReg = 0;
	  if ((I->getOpcode() == Instruction::SRem ||
	       I->getOpcode() == Instruction::URem) &&
	      OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
	    unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
	    unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(Copy), SourceSuperReg).addReg(X86::AX);

	    // Shift AX right by 8 bits instead of using AH.
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
	            ResultSuperReg).addReg(SourceSuperReg).addImm(8);

	    // Now reference the 8-bit subreg of the result.
	    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
	                                           /*Kill=*/true, X86::sub_8bit);
	  }
	  // Copy the result out of the physreg if we haven't already.
	  if (!ResultReg) {
	    ResultReg = createResultReg(TypeEntry.RC);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
	        .addReg(OpEntry.DivRemResultReg);
	  }
	  updateValueMap(I, ResultReg);

	  return true;
	}

	/// \brief Emit a conditional move instruction (if they are supported) to lower
	/// the select.
	///
	/// Handles \p RetVT in the range i16..i64 only. The condition is taken from a
	/// compare in the same basic block or from a folded overflow intrinsic when
	/// possible; otherwise the condition register is tested against 1. Returns
	/// false when CMOV is unavailable, the type is unsupported, or a needed
	/// register cannot be obtained.
	bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
	  // Check if the subtarget supports these instructions.
	  if (!Subtarget->hasCMov())
	    return false;

	  // FIXME: Add support for i8.
	  if (RetVT < MVT::i16 || RetVT > MVT::i64)
	    return false;

	  const Value *Cond = I->getOperand(0);
	  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
	  bool NeedTest = true;
	  X86::CondCode CC = X86::COND_NE;

	  // Optimize conditions coming from a compare if both instructions are in the
	  // same basic block (values defined in other basic blocks may not have
	  // initialized registers).
	  const auto *CI = dyn_cast<CmpInst>(Cond);
	  if (CI && (CI->getParent() == I->getParent())) {
	    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);

	    // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
	    // Each row is { first SETcc, second SETcc, opcode combining the two }.
	    static const uint16_t SETFOpcTable[2][3] = {
	      { X86::SETNPr, X86::SETEr , X86::TEST8rr },
	      { X86::SETPr,  X86::SETNEr, X86::OR8rr   }
	    };
	    const uint16_t *SETFOpc = nullptr;
	    switch (Predicate) {
	    default: break;
	    case CmpInst::FCMP_OEQ:
	      SETFOpc = &SETFOpcTable[0][0];
	      Predicate = CmpInst::ICMP_NE;
	      break;
	    case CmpInst::FCMP_UNE:
	      SETFOpc = &SETFOpcTable[1][0];
	      Predicate = CmpInst::ICMP_NE;
	      break;
	    }

	    bool NeedSwap;
	    std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
	    assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");

	    const Value *CmpLHS = CI->getOperand(0);
	    const Value *CmpRHS = CI->getOperand(1);
	    if (NeedSwap)
	      std::swap(CmpLHS, CmpRHS);

	    EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
	    // Emit a compare of the LHS and RHS, setting the flags.
	    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
	      return false;

	    if (SETFOpc) {
	      unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
	      unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
	      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
	              FlagReg1);
	      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
	              FlagReg2);
	      // The combining opcode may or may not define a register (TEST8rr vs.
	      // OR8rr in the table above); only pass a destination when it does.
	      auto const &II = TII.get(SETFOpc[2]);
	      if (II.getNumDefs()) {
	        unsigned TmpReg = createResultReg(&X86::GR8RegClass);
	        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
	          .addReg(FlagReg2).addReg(FlagReg1);
	      } else {
	        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
	          .addReg(FlagReg2).addReg(FlagReg1);
	      }
	    }
	    NeedTest = false;
	  } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
	    // Fake request the condition, otherwise the intrinsic might be completely
	    // optimized away.
	    unsigned TmpReg = getRegForValue(Cond);
	    if (TmpReg == 0)
	      return false;

	    NeedTest = false;
	  }

	  if (NeedTest) {
	    // Selects operate on i1, however, CondReg is 8 bits width and may contain
	    // garbage. Indeed, only the less significant bit is supposed to be
	    // accurate. If we read more than the lsb, we may see non-zero values
	    // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
	    // the select. This is achieved by performing TEST against 1.
	    unsigned CondReg = getRegForValue(Cond);
	    if (CondReg == 0)
	      return false;
	    bool CondIsKill = hasTrivialKill(Cond);

	    // In case OpReg is a K register, COPY to a GPR
	    if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
	      unsigned KCondReg = CondReg;
	      CondReg = createResultReg(&X86::GR32RegClass);
	      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	              TII.get(TargetOpcode::COPY), CondReg)
	          .addReg(KCondReg, getKillRegState(CondIsKill));
	      CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
	                                           X86::sub_8bit);
	    }
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
	        .addReg(CondReg, getKillRegState(CondIsKill))
	        .addImm(1);
	  }

	  const Value *LHS = I->getOperand(1);
	  const Value *RHS = I->getOperand(2);

	  unsigned RHSReg = getRegForValue(RHS);
	  bool RHSIsKill = hasTrivialKill(RHS);

	  unsigned LHSReg = getRegForValue(LHS);
	  bool LHSIsKill = hasTrivialKill(LHS);

	  if (!LHSReg || !RHSReg)
	    return false;

	  const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
	  unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8);
	  unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
	                                       LHSReg, LHSIsKill);
	  updateValueMap(I, ResultReg);
	  return true;
	}

	/// \brief Emit SSE or AVX instructions to lower the select.
	///
	/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
	/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
	/// SSE instructions are available. If AVX is available, try to use a VBLENDV.
	///
	/// Only applies when the condition is an fcmp in the same basic block whose
	/// operand type equals the select's type, and \p RetVT is f32 (with SSE1) or
	/// f64 (with SSE2). Returns false otherwise, or when any of the four operand
	/// registers cannot be obtained.
	bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
	  // Optimize conditions coming from a compare if both instructions are in the
	  // same basic block (values defined in other basic blocks may not have
	  // initialized registers).
	  const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
	  if (!CI || (CI->getParent() != I->getParent()))
	    return false;

	  if (I->getType() != CI->getOperand(0)->getType() ||
	      !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
	        (Subtarget->hasSSE2() && RetVT == MVT::f64)))
	    return false;

	  const Value *CmpLHS = CI->getOperand(0);
	  const Value *CmpRHS = CI->getOperand(1);
	  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);

	  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
	  // We don't have to materialize a zero constant for this case and can just use
	  // %x again on the RHS.
	  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
	    const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
	    if (CmpRHSC && CmpRHSC->isNullValue())
	      CmpRHS = CmpLHS;
	  }

	  unsigned CC;
	  bool NeedSwap;
	  std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
	  // Condition codes above 7 are only accepted when AVX is available.
	  if (CC > 7 && !Subtarget->hasAVX())
	    return false;

	  if (NeedSwap)
	    std::swap(CmpLHS, CmpRHS);

	  // Choose the SSE instruction sequence based on data type (float or double).
	  static const uint16_t OpcTable[2][4] = {
	    { X86::CMPSSrr,  X86::ANDPSrr,  X86::ANDNPSrr,  X86::ORPSrr  },
	    { X86::CMPSDrr,  X86::ANDPDrr,  X86::ANDNPDrr,  X86::ORPDrr  }
	  };

	  const uint16_t *Opc = nullptr;
	  switch (RetVT.SimpleTy) {
	  default: return false;
	  case MVT::f32: Opc = &OpcTable[0][0]; break;
	  case MVT::f64: Opc = &OpcTable[1][0]; break;
	  }

	  const Value *LHS = I->getOperand(1);
	  const Value *RHS = I->getOperand(2);

	  unsigned LHSReg = getRegForValue(LHS);
	  bool LHSIsKill = hasTrivialKill(LHS);

	  unsigned RHSReg = getRegForValue(RHS);
	  bool RHSIsKill = hasTrivialKill(RHS);

	  unsigned CmpLHSReg = getRegForValue(CmpLHS);
	  bool CmpLHSIsKill = hasTrivialKill(CmpLHS);

	  unsigned CmpRHSReg = getRegForValue(CmpRHS);
	  bool CmpRHSIsKill = hasTrivialKill(CmpRHS);

	  // Check the registers, not the Value pointers: getRegForValue() returns 0
	  // on failure, and register 0 must not reach the instructions built below.
	  if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
	    return false;

	  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
	  unsigned ResultReg;

	  if (Subtarget->hasAVX512()) {
	    // If we have AVX512 we can use a mask compare and masked movss/sd.
	    const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
	    const TargetRegisterClass *VK1 = &X86::VK1RegClass;

	    unsigned CmpOpcode =
	      (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
	    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
	                                       CmpRHSReg, CmpRHSIsKill, CC);

	    // Need an IMPLICIT_DEF for the input that is used to generate the upper
	    // bits of the result register since its not based on any of the inputs.
	    unsigned ImplicitDefReg = createResultReg(VR128X);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);

	    // Place RHSReg is the passthru of the masked movss/sd operation and put
	    // LHS in the input. The mask input comes from the compare.
	    unsigned MovOpcode =
	      (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
	    unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
	                                        CmpReg, true, ImplicitDefReg, true,
	                                        LHSReg, LHSIsKill);

	    ResultReg = createResultReg(RC);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);

	  } else if (Subtarget->hasAVX()) {
	    const TargetRegisterClass *VR128 = &X86::VR128RegClass;

	    // If we have AVX, create 1 blendv instead of 3 logic instructions.
	    // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
	    // uses XMM0 as the selection register. That may need just as many
	    // instructions as the AND/ANDN/OR sequence due to register moves, so
	    // don't bother.
	    unsigned CmpOpcode =
	      (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
	    unsigned BlendOpcode =
	      (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;

	    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
	                                       CmpRHSReg, CmpRHSIsKill, CC);
	    unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
	                                          LHSReg, LHSIsKill, CmpReg, true);
	    ResultReg = createResultReg(RC);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
	  } else {
	    // Plain SSE: (mask & LHS) | (~mask & RHS), where mask is the compare
	    // result.
	    const TargetRegisterClass *VR128 = &X86::VR128RegClass;
	    unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
	                                       CmpRHSReg, CmpRHSIsKill, CC);
	    unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
	                                      LHSReg, LHSIsKill);
	    unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
	                                       RHSReg, RHSIsKill);
	    unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
	                                     AndReg, /*IsKill=*/true);
	    ResultReg = createResultReg(RC);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
	  }
	  updateValueMap(I, ResultReg);
	  return true;
	}

	/// Lower a select through one of the CMOV_* pseudo instructions, which are
	/// expanded into control flow later on.
	///
	/// Returns false when \p RetVT has no pseudo CMOV, when the compare predicate
	/// has no valid X86 condition code, when the compare cannot be emitted, or
	/// when an operand cannot be assigned a register.
	bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
	  // Pick the pseudo CMOV that matches the result type.
	  unsigned PseudoOpc;
	  switch (RetVT.SimpleTy) {
	  case MVT::i8:
	    PseudoOpc = X86::CMOV_GR8;
	    break;
	  case MVT::i16:
	    PseudoOpc = X86::CMOV_GR16;
	    break;
	  case MVT::i32:
	    PseudoOpc = X86::CMOV_GR32;
	    break;
	  case MVT::f32:
	    PseudoOpc = X86::CMOV_FR32;
	    break;
	  case MVT::f64:
	    PseudoOpc = X86::CMOV_FR64;
	    break;
	  default:
	    return false;
	  }

	  const Value *SelCond = I->getOperand(0);
	  X86::CondCode CC = X86::COND_NE;

	  // A compare can only be folded when it lives in the same basic block as
	  // the select: values defined in other blocks may not have initialized
	  // registers.
	  const auto *Cmp = dyn_cast<CmpInst>(SelCond);
	  const bool FoldCompare = Cmp && Cmp->getParent() == I->getParent();

	  if (FoldCompare) {
	    bool SwapOperands;
	    std::tie(CC, SwapOperands) = X86::getX86ConditionCode(Cmp->getPredicate());
	    if (CC > X86::LAST_VALID_COND)
	      return false;

	    // Fetch the compare operands, already exchanged if the condition code
	    // asks for it.
	    const Value *FirstOp = Cmp->getOperand(SwapOperands ? 1 : 0);
	    const Value *SecondOp = Cmp->getOperand(SwapOperands ? 0 : 1);

	    EVT CmpVT = TLI.getValueType(DL, FirstOp->getType());
	    if (!X86FastEmitCompare(FirstOp, SecondOp, CmpVT, Cmp->getDebugLoc()))
	      return false;
	  } else {
	    unsigned CondReg = getRegForValue(SelCond);
	    if (!CondReg)
	      return false;
	    const bool CondIsKill = hasTrivialKill(SelCond);

	    // A condition held in a K register is first copied into a GPR, of which
	    // only the low 8 bits are then tested.
	    if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
	      const unsigned MaskReg = CondReg;
	      CondReg = createResultReg(&X86::GR32RegClass);
	      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	              TII.get(TargetOpcode::COPY), CondReg)
	          .addReg(MaskReg, getKillRegState(CondIsKill));
	      CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
	                                           X86::sub_8bit);
	    }

	    // Test bit 0 of the condition; CC stays at its COND_NE default.
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
	        .addReg(CondReg, getKillRegState(CondIsKill))
	        .addImm(1);
	  }

	  // Materialize both select arms before checking either for failure.
	  const Value *TrueVal = I->getOperand(1);
	  const Value *FalseVal = I->getOperand(2);

	  const unsigned TrueReg = getRegForValue(TrueVal);
	  const bool TrueIsKill = hasTrivialKill(TrueVal);

	  const unsigned FalseReg = getRegForValue(FalseVal);
	  const bool FalseIsKill = hasTrivialKill(FalseVal);

	  if (TrueReg == 0 || FalseReg == 0)
	    return false;

	  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
	  const unsigned SelectReg = fastEmitInst_rri(PseudoOpc, RC, FalseReg,
	                                              FalseIsKill, TrueReg, TrueIsKill,
	                                              CC);
	  updateValueMap(I, SelectReg);
	  return true;
	}

	bool X86FastISel::X86SelectSelect(const Instruction *I) {
	MVT RetVT;
	if (!isTypeLegal(I->getType(), RetVT))
	return false;

	// Check if we can fold the select.
	if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
	CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
	const Value *Opnd = nullptr;
	switch (Predicate) {
	default: break;
	case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
	case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
	}
	// No need for a select anymore - this is an unconditional move.
	if (Opnd) {
	unsigned OpReg = getRegForValue(Opnd);
	if (OpReg == 0)
	return false;
	bool OpIsKill = hasTrivialKill(Opnd);
	const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
	unsigned ResultReg = createResultReg(RC);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), ResultReg)
	.addReg(OpReg, getKillRegState(OpIsKill));
	updateValueMap(I, ResultReg);
	return true;
	}
	}

	// First try to use real conditional move instructions.
	if (X86FastEmitCMoveSelect(RetVT, I))
	return true;

	// Try to use a sequence of SSE instructions to simulate a conditional move.
	if (X86FastEmitSSESelect(RetVT, I))
	return true;

	// Fall-back to pseudo conditional move instructions, which will be later
	// converted to control-flow.
	if (X86FastEmitPseudoSelect(RetVT, I))
	return true;

	return false;
	}

	bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
	// The target-independent selection algorithm in FastISel already knows how
	// to select a SINT_TO_FP if the target is SSE but not AVX.
	// Early exit if the subtarget doesn't have AVX.
	if (!Subtarget->hasAVX())
	return false;

	Type *InTy = I->getOperand(0)->getType();
	if (!InTy->isIntegerTy(32) && !InTy->isIntegerTy(64))
	return false;

	// Select integer to float/double conversion.
	unsigned OpReg = getRegForValue(I->getOperand(0));
	if (OpReg == 0)
	return false;

	const TargetRegisterClass *RC = nullptr;
	unsigned Opcode;

	if (I->getType()->isDoubleTy()) {
	// sitofp int -> double
	Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SDrr : X86::VCVTSI2SDrr;
	RC = &X86::FR64RegClass;
	} else if (I->getType()->isFloatTy()) {
	// sitofp int -> float
	Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SSrr : X86::VCVTSI2SSrr;
	RC = &X86::FR32RegClass;
	} else
	return false;

	unsigned ImplicitDefReg = createResultReg(RC);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
	unsigned ResultReg =
	fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
	updateValueMap(I, ResultReg);
	return true;
	}

	/// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
	///
	/// Emits \p TargetOpc with the operand of \p I as its source and a fresh
	/// register of class \p RC as its destination. On AVX subtargets an
	/// IMPLICIT_DEF register of class \p RC is added as an extra leading source
	/// operand.
	///
	/// \returns false if the operand of \p I could not be assigned a register,
	/// true once the instruction has been emitted and \p I mapped to its result.
	bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
	                                          unsigned TargetOpc,
	                                          const TargetRegisterClass *RC) {
	  assert((I->getOpcode() == Instruction::FPExt ||
	          I->getOpcode() == Instruction::FPTrunc) &&
	         "Instruction must be an FPExt or FPTrunc!");

	  unsigned OpReg = getRegForValue(I->getOperand(0));
	  if (OpReg == 0)
	    return false;

	  // Query the subtarget once so that creating and using the IMPLICIT_DEF
	  // register are guaranteed to agree. The register is zero-initialized so it
	  // never holds an indeterminate value, mirroring the Intrinsic::sqrt
	  // lowering in this file.
	  const bool HasAVX = Subtarget->hasAVX();
	  unsigned ImplicitDefReg = 0;
	  if (HasAVX) {
	    ImplicitDefReg = createResultReg(RC);
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	            TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
	  }

	  unsigned ResultReg = createResultReg(RC);
	  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	                                    TII.get(TargetOpc), ResultReg);

	  // Operand order matters: the IMPLICIT_DEF (if any) precedes the real
	  // source.
	  if (HasAVX)
	    MIB.addReg(ImplicitDefReg);

	  MIB.addReg(OpReg);
	  updateValueMap(I, ResultReg);
	  return true;
	}

	bool X86FastISel::X86SelectFPExt(const Instruction *I) {
	if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
	I->getOperand(0)->getType()->isFloatTy()) {
	bool HasAVX512 = Subtarget->hasAVX512();
	// fpext from float to double.
	unsigned Opc =
	HasAVX512 ? X86::VCVTSS2SDZrr
	: Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
	return X86SelectFPExtOrFPTrunc(
	I, Opc, HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass);
	}

	return false;
	}

	bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
	if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
	I->getOperand(0)->getType()->isDoubleTy()) {
	bool HasAVX512 = Subtarget->hasAVX512();
	// fptrunc from double to float.
	unsigned Opc =
	HasAVX512 ? X86::VCVTSD2SSZrr
	: Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
	return X86SelectFPExtOrFPTrunc(
	I, Opc, HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass);
	}

	return false;
	}

	bool X86FastISel::X86SelectTrunc(const Instruction *I) {
	EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
	EVT DstVT = TLI.getValueType(DL, I->getType());

	// This code only handles truncation to byte.
	if (DstVT != MVT::i8 && DstVT != MVT::i1)
	return false;
	if (!TLI.isTypeLegal(SrcVT))
	return false;

	unsigned InputReg = getRegForValue(I->getOperand(0));
	if (!InputReg)
	// Unhandled operand. Halt "fast" selection and bail.
	return false;

	if (SrcVT == MVT::i8) {
	// Truncate from i8 to i1; no code needed.
	updateValueMap(I, InputReg);
	return true;
	}

	// Issue an extract_subreg.
	unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
	InputReg, false,
	X86::sub_8bit);
	if (!ResultReg)
	return false;

	updateValueMap(I, ResultReg);
	return true;
	}

	/// Returns true when \p Len is within the small-memcpy limit: 32 bytes on
	/// 64-bit subtargets, 16 bytes otherwise.
	bool X86FastISel::IsMemcpySmall(uint64_t Len) {
	  const uint64_t MaxSmallBytes = Subtarget->is64Bit() ? 32 : 16;
	  return Len <= MaxSmallBytes;
	}

	/// Expand a memcpy of \p Len bytes from \p SrcAM to \p DestAM into a sequence
	/// of integer load/store pairs, always using the widest access that still
	/// fits in the remaining length (i64 only on 64-bit subtargets).
	///
	/// \p DestAM and \p SrcAM are taken by value; their displacements are
	/// advanced locally after each chunk.
	///
	/// \returns false, emitting nothing, when \p Len exceeds the IsMemcpySmall()
	/// limit; true once every byte has been copied.
	bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
	                                     X86AddressMode SrcAM, uint64_t Len) {

	  // Make sure we don't bloat code by inlining very large memcpy's.
	  if (!IsMemcpySmall(Len))
	    return false;

	  bool i64Legal = Subtarget->is64Bit();

	  // We don't care about alignment here since we just emit integer accesses.
	  while (Len) {
	    MVT VT;
	    if (Len >= 8 && i64Legal)
	      VT = MVT::i64;
	    else if (Len >= 4)
	      VT = MVT::i32;
	    else if (Len >= 2)
	      VT = MVT::i16;
	    else
	      VT = MVT::i8;

	    // Initialized so that a failed load can never hand an indeterminate
	    // register number to the store below in builds without assertions.
	    unsigned Reg = 0;
	    bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
	    RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
	    assert(RV && "Failed to emit load or store??");
	    // RV is only read by the assert; silence the unused-variable warning in
	    // NDEBUG builds, as done for 'Emitted' elsewhere in this file.
	    (void)RV;

	    // Advance both addresses past the chunk that was just copied.
	    unsigned Size = VT.getSizeInBits()/8;
	    Len -= Size;
	    DestAM.Disp += Size;
	    SrcAM.Disp += Size;
	  }

	  return true;
	}

	// Lower the intrinsic call II directly to machine instructions.
	//
	// Handles the fp16 conversions, frameaddress, memcpy, memset, stackprotector,
	// dbg_declare, trap, sqrt, the *_with_overflow arithmetic intrinsics and the
	// SSE/SSE2 truncating float-to-int conversions. Returns true when the
	// intrinsic was lowered; returns false for any other intrinsic and whenever
	// a handled case runs into a condition it does not support.
	bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
	// FIXME: Handle more intrinsics.
	switch (II->getIntrinsicID()) {
	default: return false;
	case Intrinsic::convert_from_fp16:
	case Intrinsic::convert_to_fp16: {
	// Not handled under soft-float or when the subtarget lacks F16C.
	if (Subtarget->useSoftFloat() \|\| !Subtarget->hasF16C())
	return false;

	const Value *Op = II->getArgOperand(0);
	unsigned InputReg = getRegForValue(Op);
	if (InputReg == 0)
	return false;

	// F16C only allows converting from float to half and from half to float.
	bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
	if (IsFloatToHalf) {
	if (!Op->getType()->isFloatTy())
	return false;
	} else {
	if (!II->getType()->isFloatTy())
	return false;
	}

	unsigned ResultReg = 0;
	const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
	if (IsFloatToHalf) {
	// 'InputReg' is implicitly promoted from register class FR32 to
	// register class VR128 by method 'constrainOperandRegClass' which is
	// directly called by 'fastEmitInst_ri'.
	// Instruction VCVTPS2PHrr takes an extra immediate operand which is
	// used to provide rounding control: use MXCSR.RC, encoded as 0b100.
	// It's consistent with the other FP instructions, which are usually
	// controlled by MXCSR.
	InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4);

	// Move the lower 32-bits of ResultReg to another register of class GR32.
	ResultReg = createResultReg(&X86::GR32RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(X86::VMOVPDI2DIrr), ResultReg)
	.addReg(InputReg, RegState::Kill);

	// The result value is in the lower 16-bits of ResultReg.
	unsigned RegIdx = X86::sub_16bit;
	ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
	} else {
	assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
	// Explicitly sign-extend the input to 32-bit.
	InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
	/Kill=/false);

	// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
	InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
	InputReg, /Kill=/true);

	InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /Kill=/true);

	// The result value is in the lower 32-bits of ResultReg.
	// Emit an explicit copy from register class VR128 to register class FR32.
	ResultReg = createResultReg(&X86::FR32RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), ResultReg)
	.addReg(InputReg, RegState::Kill);
	}

	updateValueMap(II, ResultReg);
	return true;
	}
	case Intrinsic::frameaddress: {
	MachineFunction *MF = FuncInfo.MF;
	// Not handled when the target uses Windows CFI.
	if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
	return false;

	Type *RetTy = II->getCalledFunction()->getReturnType();

	MVT VT;
	if (!isTypeLegal(RetTy, VT))
	return false;

	// Load opcode and register class used to walk the chain of frame
	// addresses; only i32 and i64 results are expected.
	unsigned Opc;
	const TargetRegisterClass *RC = nullptr;

	switch (VT.SimpleTy) {
	default: llvm_unreachable("Invalid result type for frameaddress.");
	case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
	case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
	}

	// This needs to be set before we call getPtrSizedFrameRegister, otherwise
	// we get the wrong frame register.
	MachineFrameInfo &MFI = MF->getFrameInfo();
	MFI.setFrameAddressIsTaken(true);

	const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
	unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
	assert(((FrameReg == X86::RBP && VT == MVT::i64) \|\|
	(FrameReg == X86::EBP && VT == MVT::i32)) &&
	"Invalid Frame Register!");

	// Always make a copy of the frame register to a vreg first, so that we
	// never directly reference the frame register (the TwoAddressInstruction-
	// Pass doesn't like that).
	unsigned SrcReg = createResultReg(RC);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);

	// Now recursively load from the frame address.
	// movq (%rbp), %rax
	// movq (%rax), %rax
	// movq (%rax), %rax
	// ...
	// One load is emitted per level of the constant depth operand; a depth of
	// zero emits no load and returns the copy of the frame register itself.
	unsigned DestReg;
	unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
	while (Depth--) {
	DestReg = createResultReg(RC);
	addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(Opc), DestReg), SrcReg);
	SrcReg = DestReg;
	}

	updateValueMap(II, SrcReg);
	return true;
	}
	case Intrinsic::memcpy: {
	const MemCpyInst *MCI = cast<MemCpyInst>(II);
	// Don't handle volatile or variable length memcpys.
	if (MCI->isVolatile())
	return false;

	if (isa<ConstantInt>(MCI->getLength())) {
	// Small memcpy's are common enough that we want to do them
	// without a call if possible.
	uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
	if (IsMemcpySmall(Len)) {
	X86AddressMode DestAM, SrcAM;
	if (!X86SelectAddress(MCI->getRawDest(), DestAM) \|\|
	!X86SelectAddress(MCI->getRawSource(), SrcAM))
	return false;
	// The return value is not checked here; Len has already passed the
	// IsMemcpySmall test above.
	TryEmitSmallMemcpy(DestAM, SrcAM, Len);
	return true;
	}
	}

	// Otherwise lower to a call to memcpy, but only if the length operand is
	// SizeWidth bits wide and both address spaces are at most 255.
	unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
	if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
	return false;

	if (MCI->getSourceAddressSpace() > 255 \|\| MCI->getDestAddressSpace() > 255)
	return false;

	// NOTE(review): the last argument presumably limits how many intrinsic
	// operands are forwarded to the call (dropping the trailing two) - confirm
	// against lowerCallTo.
	return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
	}
	case Intrinsic::memset: {
	const MemSetInst *MSI = cast<MemSetInst>(II);

	// Volatile memsets are not handled.
	if (MSI->isVolatile())
	return false;

	// As for memcpy: the length operand must be SizeWidth bits wide and the
	// destination address space at most 255.
	unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
	if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
	return false;

	if (MSI->getDestAddressSpace() > 255)
	return false;

	return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
	}
	case Intrinsic::stackprotector: {
	// Emit code to store the stack guard onto the stack.
	EVT PtrTy = TLI.getPointerTy(DL);

	const Value *Op1 = II->getArgOperand(0); // The guard's value.
	const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));

	// Record the slot's entry in the static alloca map as the stack protector
	// index.
	MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);

	// Grab the frame index.
	X86AddressMode AM;
	if (!X86SelectAddress(Slot, AM)) return false;
	if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
	return true;
	}
	case Intrinsic::dbg_declare: {
	const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
	X86AddressMode AM;
	assert(DI->getAddress() && "Null address should be checked earlier!");
	if (!X86SelectAddress(DI->getAddress(), AM))
	return false;
	// Note: this local 'II' shadows the intrinsic parameter 'II' for the rest
	// of this case.
	const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
	// FIXME may need to add RegState::Debug to any registers produced,
	// although ESP/EBP should be the only ones at the moment.
	assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
	"Expected inlined-at fields to agree");
	// Emit a DBG_VALUE whose operands are the selected address, an immediate
	// 0, and the variable and expression metadata.
	addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
	.addImm(0)
	.addMetadata(DI->getVariable())
	.addMetadata(DI->getExpression());
	return true;
	}
	case Intrinsic::trap: {
	// Emit a single X86::TRAP instruction.
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
	return true;
	}
	case Intrinsic::sqrt: {
	if (!Subtarget->hasSSE1())
	return false;

	Type *RetTy = II->getCalledFunction()->getReturnType();

	MVT VT;
	if (!isTypeLegal(RetTy, VT))
	return false;

	// Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
	// is not generated by FastISel yet.
	// FIXME: Update this code once tablegen can handle it.
	// Indexed as SqrtOpc[is f64][has AVX].
	static const uint16_t SqrtOpc[2][2] = {
	{X86::SQRTSSr, X86::VSQRTSSr},
	{X86::SQRTSDr, X86::VSQRTSDr}
	};
	bool HasAVX = Subtarget->hasAVX();
	unsigned Opc;
	const TargetRegisterClass *RC;
	switch (VT.SimpleTy) {
	default: return false;
	case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
	case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
	}

	const Value *SrcVal = II->getArgOperand(0);
	unsigned SrcReg = getRegForValue(SrcVal);

	if (SrcReg == 0)
	return false;

	// With AVX an IMPLICIT_DEF register is added as an extra operand ahead of
	// the real source; ImplicitDefReg stays 0 otherwise.
	unsigned ImplicitDefReg = 0;
	if (HasAVX) {
	ImplicitDefReg = createResultReg(RC);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
	}

	unsigned ResultReg = createResultReg(RC);
	MachineInstrBuilder MIB;
	MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
	ResultReg);

	if (ImplicitDefReg)
	MIB.addReg(ImplicitDefReg);

	MIB.addReg(SrcReg);

	updateValueMap(II, ResultReg);
	return true;
	}
	case Intrinsic::sadd_with_overflow:
	case Intrinsic::uadd_with_overflow:
	case Intrinsic::ssub_with_overflow:
	case Intrinsic::usub_with_overflow:
	case Intrinsic::smul_with_overflow:
	case Intrinsic::umul_with_overflow: {
	// This implements the basic lowering of the xalu with overflow intrinsics
	// into add/sub/mul followed by either seto or setb.
	const Function *Callee = II->getCalledFunction();
	auto *Ty = cast<StructType>(Callee->getReturnType());
	Type *RetTy = Ty->getTypeAtIndex(0U);
	assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
	Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
	"Overflow value expected to be an i1");

	MVT VT;
	if (!isTypeLegal(RetTy, VT))
	return false;

	// Only value types in the range i8..i64 are handled; the opcode tables
	// below are indexed by VT.SimpleTy - MVT::i8.
	if (VT < MVT::i8 \|\| VT > MVT::i64)
	return false;

	const Value *LHS = II->getArgOperand(0);
	const Value *RHS = II->getArgOperand(1);

	// Canonicalize immediate to the RHS.
	if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
	isCommutativeIntrinsic(II))
	std::swap(LHS, RHS);

	// A constant RHS of one allows INC/DEC; this is only acted upon for the
	// signed add and signed sub cases in the switch below.
	bool UseIncDec = false;
	if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
	UseIncDec = true;

	// BaseOpc is the arithmetic node to emit; CondOpc is the SETcc emitted
	// afterwards to produce the overflow result.
	unsigned BaseOpc, CondOpc;
	switch (II->getIntrinsicID()) {
	default: llvm_unreachable("Unexpected intrinsic!");
	case Intrinsic::sadd_with_overflow:
	BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
	CondOpc = X86::SETOr;
	break;
	case Intrinsic::uadd_with_overflow:
	BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
	case Intrinsic::ssub_with_overflow:
	BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
	CondOpc = X86::SETOr;
	break;
	case Intrinsic::usub_with_overflow:
	BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
	case Intrinsic::smul_with_overflow:
	BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
	case Intrinsic::umul_with_overflow:
	BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
	}

	unsigned LHSReg = getRegForValue(LHS);
	if (LHSReg == 0)
	return false;
	bool LHSIsKill = hasTrivialKill(LHS);

	unsigned ResultReg = 0;
	// Check if we have an immediate version.
	if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
	// Indexed as Opc[IsDec][VT.SimpleTy - MVT::i8].
	static const uint16_t Opc[2][4] = {
	{ X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
	{ X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
	};

	if (BaseOpc == X86ISD::INC \|\| BaseOpc == X86ISD::DEC) {
	ResultReg = createResultReg(TLI.getRegClassFor(VT));
	bool IsDec = BaseOpc == X86ISD::DEC;
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
	.addReg(LHSReg, getKillRegState(LHSIsKill));
	} else
	ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
	CI->getZExtValue());
	}

	// No immediate form was emitted (RHS is not a constant, or the emit above
	// produced no register): fall back to the register-register form. RHSReg
	// and RHSIsKill are only assigned on this path.
	unsigned RHSReg;
	bool RHSIsKill;
	if (!ResultReg) {
	RHSReg = getRegForValue(RHS);
	if (RHSReg == 0)
	return false;
	RHSIsKill = hasTrivialKill(RHS);
	ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
	RHSIsKill);
	}

	// FastISel doesn't have a pattern for all X86::MULr and X86::IMULr. Emit
	// it manually.
	if (BaseOpc == X86ISD::UMUL && !ResultReg) {
	static const uint16_t MULOpc[] =
	{ X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
	static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
	// First copy the first operand into RAX, which is an implicit input to
	// the X86::MUL*r instruction.
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
	.addReg(LHSReg, getKillRegState(LHSIsKill));
	ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
	TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
	} else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
	static const uint16_t MULOpc[] =
	{ X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
	if (VT == MVT::i8) {
	// Copy the first operand into AL, which is an implicit input to the
	// X86::IMUL8r instruction.
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), X86::AL)
	.addReg(LHSReg, getKillRegState(LHSIsKill));
	ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
	RHSIsKill);
	} else
	ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
	TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
	RHSReg, RHSIsKill);
	}

	if (!ResultReg)
	return false;

	// Assign to a GPR since the overflow return value is lowered to a SETcc.
	// The assert below requires the two result registers to be numbered
	// consecutively. NOTE(review): the 2 passed to updateValueMap presumably
	// covers ResultReg and ResultReg2 - confirm against updateValueMap.
	unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
	assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
	ResultReg2);

	updateValueMap(II, ResultReg, 2);
	return true;
	}
	case Intrinsic::x86_sse_cvttss2si:
	case Intrinsic::x86_sse_cvttss2si64:
	case Intrinsic::x86_sse2_cvttsd2si:
	case Intrinsic::x86_sse2_cvttsd2si64: {
	// The float-input variants require SSE1 and the double-input variants
	// require SSE2.
	bool IsInputDouble;
	switch (II->getIntrinsicID()) {
	default: llvm_unreachable("Unexpected intrinsic.");
	case Intrinsic::x86_sse_cvttss2si:
	case Intrinsic::x86_sse_cvttss2si64:
	if (!Subtarget->hasSSE1())
	return false;
	IsInputDouble = false;
	break;
	case Intrinsic::x86_sse2_cvttsd2si:
	case Intrinsic::x86_sse2_cvttsd2si64:
	if (!Subtarget->hasSSE2())
	return false;
	IsInputDouble = true;
	break;
	}

	Type *RetTy = II->getCalledFunction()->getReturnType();
	MVT VT;
	if (!isTypeLegal(RetTy, VT))
	return false;

	// Indexed as CvtOpc[input is double][result is i64][has AVX].
	static const uint16_t CvtOpc[2][2][2] = {
	{ { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
	{ X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
	{ { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
	{ X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
	};
	bool HasAVX = Subtarget->hasAVX();
	unsigned Opc;
	switch (VT.SimpleTy) {
	default: llvm_unreachable("Unexpected result type.");
	case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
	case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
	}

	// Check if we can fold insertelement instructions into the convert.
	// Walk up the insertelement chain: an insert at constant index 0 supplies
	// the scalar to convert, an insert at any other constant index is skipped
	// in favor of its vector operand, and a non-constant index stops the walk.
	const Value *Op = II->getArgOperand(0);
	while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
	const Value *Index = IE->getOperand(2);
	if (!isa<ConstantInt>(Index))
	break;
	unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();

	if (Idx == 0) {
	Op = IE->getOperand(1);
	break;
	}
	Op = IE->getOperand(0);
	}

	unsigned Reg = getRegForValue(Op);
	if (Reg == 0)
	return false;

	unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
	.addReg(Reg);

	updateValueMap(II, ResultReg);
	return true;
	}
	}
	}

	bool X86FastISel::fastLowerArguments() {
	if (!FuncInfo.CanLowerReturn)
	return false;

	const Function *F = FuncInfo.Fn;
	if (F->isVarArg())
	return false;

	CallingConv::ID CC = F->getCallingConv();
	if (CC != CallingConv::C)
	return false;

	if (Subtarget->isCallingConvWin64(CC))
	return false;

	if (!Subtarget->is64Bit())
	return false;

	if (Subtarget->useSoftFloat())
	return false;

	// Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
	unsigned GPRCnt = 0;
	unsigned FPRCnt = 0;
	for (auto const &Arg : F->args()) {
	if (Arg.hasAttribute(Attribute::ByVal) \|\|
	Arg.hasAttribute(Attribute::InReg) \|\|
	Arg.hasAttribute(Attribute::StructRet) \|\|
	Arg.hasAttribute(Attribute::SwiftSelf) \|\|
	Arg.hasAttribute(Attribute::SwiftError) \|\|
	Arg.hasAttribute(Attribute::Nest))
	return false;

	Type *ArgTy = Arg.getType();
	if (ArgTy->isStructTy() \|\| ArgTy->isArrayTy() \|\| ArgTy->isVectorTy())
	return false;

	EVT ArgVT = TLI.getValueType(DL, ArgTy);
	if (!ArgVT.isSimple()) return false;
	switch (ArgVT.getSimpleVT().SimpleTy) {
	default: return false;
	case MVT::i32:
	case MVT::i64:
	++GPRCnt;
	break;
	case MVT::f32:
	case MVT::f64:
	if (!Subtarget->hasSSE1())
	return false;
	++FPRCnt;
	break;
	}

	if (GPRCnt > 6)
	return false;

	if (FPRCnt > 8)
	return false;
	}

	static const MCPhysReg GPR32ArgRegs[] = {
	X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
	};
	static const MCPhysReg GPR64ArgRegs[] = {
	X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
	};
	static const MCPhysReg XMMArgRegs[] = {
	X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	};

	unsigned GPRIdx = 0;
	unsigned FPRIdx = 0;
	for (auto const &Arg : F->args()) {
	MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
	const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
	unsigned SrcReg;
	switch (VT.SimpleTy) {
	default: llvm_unreachable("Unexpected value type.");
	case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
	case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
	case MVT::f32: LLVM_FALLTHROUGH;
	case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
	}
	unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
	// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
	// Without this, EmitLiveInCopies may eliminate the livein if its only
	// use is a bitcast (which isn't turned into an instruction).
	unsigned ResultReg = createResultReg(RC);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), ResultReg)
	.addReg(DstReg, getKillRegState(true));
	updateValueMap(&Arg, ResultReg);
	}
	return true;
	}

	/// Returns 4 or 0, per the function name the number of bytes the callee pops
	/// for a struct-return call.
	///
	/// The result is 0 on 64-bit subtargets, on MSVCRT targets, and for the
	/// Fast, GHC and HiPE calling conventions. When a call site \p CS is given,
	/// the result is also 0 unless its first argument exists, carries StructRet,
	/// does not carry InReg, and the subtarget is not an MCU target. In every
	/// remaining case the result is 4.
	static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
	                                                  CallingConv::ID CC,
	                                                  ImmutableCallSite *CS) {
	  const bool NeverPops = Subtarget->is64Bit() ||
	                         Subtarget->getTargetTriple().isOSMSVCRT() ||
	                         CC == CallingConv::Fast ||
	                         CC == CallingConv::GHC ||
	                         CC == CallingConv::HiPE;
	  if (NeverPops)
	    return 0;

	  if (CS) {
	    // Short-circuit order matters: argument 0 is only inspected once the
	    // argument list is known to be non-empty.
	    const bool FirstArgIsPoppedSRet =
	        !CS->arg_empty() && CS->paramHasAttr(0, Attribute::StructRet) &&
	        !CS->paramHasAttr(0, Attribute::InReg) && !Subtarget->isTargetMCU();
	    if (!FirstArgIsPoppedSRet)
	      return 0;
	  }

	  return 4;
	}

	bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
	auto &OutVals = CLI.OutVals;
	auto &OutFlags = CLI.OutFlags;
	auto &OutRegs = CLI.OutRegs;
	auto &Ins = CLI.Ins;
	auto &InRegs = CLI.InRegs;
	CallingConv::ID CC = CLI.CallConv;
	bool &IsTailCall = CLI.IsTailCall;
	bool IsVarArg = CLI.IsVarArg;
	const Value *Callee = CLI.Callee;
	MCSymbol *Symbol = CLI.Symbol;

	bool Is64Bit = Subtarget->is64Bit();
	bool IsWin64 = Subtarget->isCallingConvWin64(CC);

	const CallInst *CI =
	CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
	const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;

	// Functions with no_caller_saved_registers that need special handling.
	if ((CI && CI->hasFnAttr("no_caller_saved_registers")) \|\|
	(CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
	return false;

	+ // Functions using retpoline should use SDISel for calls.
	+ if (Subtarget->useRetpoline())
	+ return false;
	+
	// Handle only C, fastcc, and webkit_js calling conventions for now.
	switch (CC) {
	default: return false;
	case CallingConv::C:
	case CallingConv::Fast:
	case CallingConv::WebKit_JS:
	case CallingConv::Swift:
	case CallingConv::X86_FastCall:
	case CallingConv::X86_StdCall:
	case CallingConv::X86_ThisCall:
	case CallingConv::Win64:
	case CallingConv::X86_64_SysV:
	break;
	}

	// Allow SelectionDAG isel to handle tail calls.
	if (IsTailCall)
	return false;

	// fastcc with -tailcallopt is intended to provide a guaranteed
	// tail call optimization. Fastisel doesn't know how to do that.
	if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
	return false;

	// Don't know how to handle Win64 varargs yet. Nothing special needed for
	// x86-32. Special handling for x86-64 is implemented.
	if (IsVarArg && IsWin64)
	return false;

	// Don't know about inalloca yet.
	if (CLI.CS && CLI.CS->hasInAllocaArgument())
	return false;

	for (auto Flag : CLI.OutFlags)
	if (Flag.isSwiftError())
	return false;

	SmallVector<MVT, 16> OutVTs;
	SmallVector<unsigned, 16> ArgRegs;

	// If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
	// instruction. This is safe because it is common to all FastISel supported
	// calling conventions on x86.
	for (int i = 0, e = OutVals.size(); i != e; ++i) {
	Value *&Val = OutVals[i];
	ISD::ArgFlagsTy Flags = OutFlags[i];
	if (auto *CI = dyn_cast<ConstantInt>(Val)) {
	if (CI->getBitWidth() < 32) {
	if (Flags.isSExt())
	Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
	else
	Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
	}
	}

	// Passing bools around ends up doing a trunc to i1 and passing it.
	// Codegen this as an argument + "and 1".
	MVT VT;
	auto *TI = dyn_cast<TruncInst>(Val);
	unsigned ResultReg;
	if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
	(TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
	TI->hasOneUse()) {
	Value *PrevVal = TI->getOperand(0);
	ResultReg = getRegForValue(PrevVal);

	if (!ResultReg)
	return false;

	if (!isTypeLegal(PrevVal->getType(), VT))
	return false;

	ResultReg =
	fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
	} else {
	if (!isTypeLegal(Val->getType(), VT))
	return false;
	ResultReg = getRegForValue(Val);
	}

	if (!ResultReg)
	return false;

	ArgRegs.push_back(ResultReg);
	OutVTs.push_back(VT);
	}

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());

	// Allocate shadow area for Win64
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getAlignedCallFrameSize();

	// Issue CALLSEQ_START
	unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
	.addImm(NumBytes).addImm(0).addImm(0);

	// Walk the register/memloc assignments, inserting copies/loads.
	const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign const &VA = ArgLocs[i];
	const Value *ArgVal = OutVals[VA.getValNo()];
	MVT ArgVT = OutVTs[VA.getValNo()];

	if (ArgVT == MVT::x86mmx)
	return false;

	unsigned ArgReg = ArgRegs[VA.getValNo()];

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	case CCValAssign::Full: break;
	case CCValAssign::SExt: {
	assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
	"Unexpected extend");

	if (ArgVT == MVT::i1)
	return false;

	bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
	ArgVT, ArgReg);
	assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
	ArgVT = VA.getLocVT();
	break;
	}
	case CCValAssign::ZExt: {
	assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
	"Unexpected extend");

	// Handle zero-extension from i1 to i8, which is common.
	if (ArgVT == MVT::i1) {
	// Set the high bits to zero.
	ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /TODO: Kill=/false);
	ArgVT = MVT::i8;

	if (ArgReg == 0)
	return false;
	}

	bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
	ArgVT, ArgReg);
	assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
	ArgVT = VA.getLocVT();
	break;
	}
	case CCValAssign::AExt: {
	assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
	"Unexpected extend");
	bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
	ArgVT, ArgReg);
	if (!Emitted)
	Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
	ArgVT, ArgReg);
	if (!Emitted)
	Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
	ArgVT, ArgReg);

	assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
	ArgVT = VA.getLocVT();
	break;
	}
	case CCValAssign::BCvt: {
	ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
	/TODO: Kill=/false);
	assert(ArgReg && "Failed to emit a bitcast!");
	ArgVT = VA.getLocVT();
	break;
	}
	case CCValAssign::VExt:
	// VExt has not been implemented, so this should be impossible to reach
	// for now. However, fallback to Selection DAG isel once implemented.
	return false;
	case CCValAssign::AExtUpper:
	case CCValAssign::SExtUpper:
	case CCValAssign::ZExtUpper:
	case CCValAssign::FPExt:
	llvm_unreachable("Unexpected loc info!");
	case CCValAssign::Indirect:
	// FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
	// support this.
	return false;
	}

	if (VA.isRegLoc()) {
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
	OutRegs.push_back(VA.getLocReg());
	} else {
	assert(VA.isMemLoc());

	// Don't emit stores for undef values.
	if (isa<UndefValue>(ArgVal))
	continue;

	unsigned LocMemOffset = VA.getLocMemOffset();
	X86AddressMode AM;
	AM.Base.Reg = RegInfo->getStackRegister();
	AM.Disp = LocMemOffset;
	ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
	unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
	MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
	MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
	MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
	if (Flags.isByVal()) {
	X86AddressMode SrcAM;
	SrcAM.Base.Reg = ArgReg;
	if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
	return false;
	} else if (isa<ConstantInt>(ArgVal) \|\| isa<ConstantPointerNull>(ArgVal)) {
	// If this is a really simple value, emit this with the Value* version
	// of X86FastEmitStore. If it isn't simple, we don't want to do this,
	// as it can cause us to reevaluate the argument.
	if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
	return false;
	} else {
	bool ValIsKill = hasTrivialKill(ArgVal);
	if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
	return false;
	}
	}
	}

	// ELF / PIC requires GOT in the EBX register before function calls via PLT
	// GOT pointer.
	if (Subtarget->isPICStyleGOT()) {
	unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
	}

	if (Is64Bit && IsVarArg && !IsWin64) {
	// From AMD64 ABI document:
	// For calls that may call functions that use varargs or stdargs
	// (prototype-less calls or calls to functions containing ellipsis (...) in
	// the declaration) %al is used as hidden argument to specify the number
	// of SSE registers used. The contents of %al do not need to match exactly
	// the number of registers, but must be an ubound on the number of SSE
	// registers used and is in the range 0 - 8 inclusive.

	// Count the number of XMM registers allocated.
	static const MCPhysReg XMMArgRegs[] = {
	X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	};
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
	assert((Subtarget->hasSSE1() \|\| !NumXMMRegs)
	&& "SSE registers cannot be used when SSE is disabled");
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
	X86::AL).addImm(NumXMMRegs);
	}

	// Materialize callee address in a register. FIXME: GV address can be
	// handled with a CALLpcrel32 instead.
	X86AddressMode CalleeAM;
	if (!X86SelectCallAddress(Callee, CalleeAM))
	return false;

	unsigned CalleeOp = 0;
	const GlobalValue *GV = nullptr;
	if (CalleeAM.GV != nullptr) {
	GV = CalleeAM.GV;
	} else if (CalleeAM.Base.Reg != 0) {
	CalleeOp = CalleeAM.Base.Reg;
	} else
	return false;

	// Issue the call.
	MachineInstrBuilder MIB;
	if (CalleeOp) {
	// Register-indirect call.
	unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
	MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
	.addReg(CalleeOp);
	} else {
	// Direct call.
	assert(GV && "Not a direct call");
	// See if we need any target-specific flags on the GV operand.
	unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);

	// This will be a direct call, or an indirect call through memory for
	// NonLazyBind calls or dllimport calls.
	bool NeedLoad =
	OpFlags == X86II::MO_DLLIMPORT \|\| OpFlags == X86II::MO_GOTPCREL;
	unsigned CallOpc = NeedLoad
	? (Is64Bit ? X86::CALL64m : X86::CALL32m)
	: (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);

	MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
	if (NeedLoad)
	MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0);
	if (Symbol)
	MIB.addSym(Symbol, OpFlags);
	else
	MIB.addGlobalAddress(GV, 0, OpFlags);
	if (NeedLoad)
	MIB.addReg(0);
	}

	// Add a register mask operand representing the call-preserved registers.
	// Proper defs for return values will be added by setPhysRegsDeadExcept().
	MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));

	// Add an implicit use GOT pointer in EBX.
	if (Subtarget->isPICStyleGOT())
	MIB.addReg(X86::EBX, RegState::Implicit);

	if (Is64Bit && IsVarArg && !IsWin64)
	MIB.addReg(X86::AL, RegState::Implicit);

	// Add implicit physical register uses to the call.
	for (auto Reg : OutRegs)
	MIB.addReg(Reg, RegState::Implicit);

	// Issue CALLSEQ_END
	unsigned NumBytesForCalleeToPop =
	X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
	TM.Options.GuaranteedTailCallOpt)
	? NumBytes // Callee pops everything.
	: computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS);
	unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
	.addImm(NumBytes).addImm(NumBytesForCalleeToPop);

	// Now handle call return values.
	SmallVector<CCValAssign, 16> RVLocs;
	CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
	CLI.RetTy->getContext());
	CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);

	// Copy all of the result registers out of their specified physreg.
	unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
	for (unsigned i = 0; i != RVLocs.size(); ++i) {
	CCValAssign &VA = RVLocs[i];
	EVT CopyVT = VA.getValVT();
	unsigned CopyReg = ResultReg + i;
	unsigned SrcReg = VA.getLocReg();

	// If this is x86-64, and we disabled SSE, we can't return FP values
	if ((CopyVT == MVT::f32 \|\| CopyVT == MVT::f64) &&
	((Is64Bit \|\| Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
	report_fatal_error("SSE register return with SSE disabled");
	}

	// If we prefer to use the value in xmm registers, copy it out as f80 and
	// use a truncate to move it from fp stack reg to xmm reg.
	if ((SrcReg == X86::FP0 \|\| SrcReg == X86::FP1) &&
	isScalarFPTypeInSSEReg(VA.getValVT())) {
	CopyVT = MVT::f80;
	CopyReg = createResultReg(&X86::RFP80RegClass);
	}

	// Copy out the result.
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
	InRegs.push_back(VA.getLocReg());

	// Round the f80 to the right size, which also moves it to the appropriate
	// xmm register. This is accomplished by storing the f80 value in memory
	// and then loading it back.
	if (CopyVT != VA.getValVT()) {
	EVT ResVT = VA.getValVT();
	unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
	unsigned MemSize = ResVT.getSizeInBits()/8;
	int FI = MFI.CreateStackObject(MemSize, MemSize, false);
	addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(Opc)), FI)
	.addReg(CopyReg);
	Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
	addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(Opc), ResultReg + i), FI);
	}
	}

	CLI.ResultReg = ResultReg;
	CLI.NumResultRegs = RVLocs.size();
	CLI.Call = MIB;

	return true;
	}

	bool
	X86FastISel::fastSelectInstruction(const Instruction *I) {
	switch (I->getOpcode()) {
	default: break;
	case Instruction::Load:
	return X86SelectLoad(I);
	case Instruction::Store:
	return X86SelectStore(I);
	case Instruction::Ret:
	return X86SelectRet(I);
	case Instruction::ICmp:
	case Instruction::FCmp:
	return X86SelectCmp(I);
	case Instruction::ZExt:
	return X86SelectZExt(I);
	case Instruction::SExt:
	return X86SelectSExt(I);
	case Instruction::Br:
	return X86SelectBranch(I);
	case Instruction::LShr:
	case Instruction::AShr:
	case Instruction::Shl:
	return X86SelectShift(I);
	case Instruction::SDiv:
	case Instruction::UDiv:
	case Instruction::SRem:
	case Instruction::URem:
	return X86SelectDivRem(I);
	case Instruction::Select:
	return X86SelectSelect(I);
	case Instruction::Trunc:
	return X86SelectTrunc(I);
	case Instruction::FPExt:
	return X86SelectFPExt(I);
	case Instruction::FPTrunc:
	return X86SelectFPTrunc(I);
	case Instruction::SIToFP:
	return X86SelectSIToFP(I);
	case Instruction::IntToPtr: // Deliberate fall-through.
	case Instruction::PtrToInt: {
	EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
	EVT DstVT = TLI.getValueType(DL, I->getType());
	if (DstVT.bitsGT(SrcVT))
	return X86SelectZExt(I);
	if (DstVT.bitsLT(SrcVT))
	return X86SelectTrunc(I);
	unsigned Reg = getRegForValue(I->getOperand(0));
	if (Reg == 0) return false;
	updateValueMap(I, Reg);
	return true;
	}
	case Instruction::BitCast: {
	// Select SSE2/AVX bitcasts between 128/256 bit vector types.
	if (!Subtarget->hasSSE2())
	return false;

	EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
	EVT DstVT = TLI.getValueType(DL, I->getType());

	if (!SrcVT.isSimple() \|\| !DstVT.isSimple())
	return false;

	MVT SVT = SrcVT.getSimpleVT();
	MVT DVT = DstVT.getSimpleVT();

	if (!SVT.is128BitVector() &&
	!(Subtarget->hasAVX() && SVT.is256BitVector()) &&
	!(Subtarget->hasAVX512() && SVT.is512BitVector() &&
	(Subtarget->hasBWI() \|\| (SVT.getScalarSizeInBits() >= 32 &&
	DVT.getScalarSizeInBits() >= 32))))
	return false;

	unsigned Reg = getRegForValue(I->getOperand(0));
	if (Reg == 0)
	return false;

	// No instruction is needed for conversion. Reuse the register used by
	// the fist operand.
	updateValueMap(I, Reg);
	return true;
	}
	}

	return false;
	}

	unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
	if (VT > MVT::i64)
	return 0;

	uint64_t Imm = CI->getZExtValue();
	if (Imm == 0) {
	unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
	switch (VT.SimpleTy) {
	default: llvm_unreachable("Unexpected value type");
	case MVT::i1:
	case MVT::i8:
	return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /Kill=/true,
	X86::sub_8bit);
	case MVT::i16:
	return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /Kill=/true,
	X86::sub_16bit);
	case MVT::i32:
	return SrcReg;
	case MVT::i64: {
	unsigned ResultReg = createResultReg(&X86::GR64RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
	.addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
	return ResultReg;
	}
	}
	}

	unsigned Opc = 0;
	switch (VT.SimpleTy) {
	default: llvm_unreachable("Unexpected value type");
	case MVT::i1:
	// TODO: Support this properly.
	if (Subtarget->hasAVX512())
	return 0;
	VT = MVT::i8;
	LLVM_FALLTHROUGH;
	case MVT::i8: Opc = X86::MOV8ri; break;
	case MVT::i16: Opc = X86::MOV16ri; break;
	case MVT::i32: Opc = X86::MOV32ri; break;
	case MVT::i64: {
	if (isUInt<32>(Imm))
	Opc = X86::MOV32ri;
	else if (isInt<32>(Imm))
	Opc = X86::MOV64ri32;
	else
	Opc = X86::MOV64ri;
	break;
	}
	}
	if (VT == MVT::i64 && Opc == X86::MOV32ri) {
	unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
	unsigned ResultReg = createResultReg(&X86::GR64RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
	.addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
	return ResultReg;
	}
	return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
	}

	unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
	if (CFP->isNullValue())
	return fastMaterializeFloatZero(CFP);

	// Can't handle alternate code models yet.
	CodeModel::Model CM = TM.getCodeModel();
	if (CM != CodeModel::Small && CM != CodeModel::Large)
	return 0;

	// Get opcode and regclass of the output for the given load instruction.
	unsigned Opc = 0;
	const TargetRegisterClass *RC = nullptr;
	switch (VT.SimpleTy) {
	default: return 0;
	case MVT::f32:
	if (X86ScalarSSEf32) {
	Opc = Subtarget->hasAVX512()
	? X86::VMOVSSZrm
	: Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
	RC = Subtarget->hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
	} else {
	Opc = X86::LD_Fp32m;
	RC = &X86::RFP32RegClass;
	}
	break;
	case MVT::f64:
	if (X86ScalarSSEf64) {
	Opc = Subtarget->hasAVX512()
	? X86::VMOVSDZrm
	: Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
	RC = Subtarget->hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
	} else {
	Opc = X86::LD_Fp64m;
	RC = &X86::RFP64RegClass;
	}
	break;
	case MVT::f80:
	// No f80 support yet.
	return 0;
	}

	// MachineConstantPool wants an explicit alignment.
	unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
	if (Align == 0) {
	// Alignment of vector types. FIXME!
	Align = DL.getTypeAllocSize(CFP->getType());
	}

	// x86-32 PIC requires a PIC base register for constant pools.
	unsigned PICBase = 0;
	unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
	if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
	PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
	else if (OpFlag == X86II::MO_GOTOFF)
	PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
	else if (Subtarget->is64Bit() && TM.getCodeModel() == CodeModel::Small)
	PICBase = X86::RIP;

	// Create the load from the constant pool.
	unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
	unsigned ResultReg = createResultReg(RC);

	if (CM == CodeModel::Large) {
	unsigned AddrReg = createResultReg(&X86::GR64RegClass);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
	AddrReg)
	.addConstantPoolIndex(CPI, 0, OpFlag);
	MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(Opc), ResultReg);
	addDirectMem(MIB, AddrReg);
	MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
	MachinePointerInfo::getConstantPool(*FuncInfo.MF),
	MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
	MIB->addMemOperand(*FuncInfo.MF, MMO);
	return ResultReg;
	}

	addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(Opc), ResultReg),
	CPI, PICBase, OpFlag);
	return ResultReg;
	}

	/// Materialize the address of the global \p GV in a register of type \p VT.
	///
	/// \returns the register holding the address, or 0 when the code model is not
	/// the small one or when no address mode could be selected for \p GV.
	unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
	  // Can't handle alternate code models yet.
	  if (TM.getCodeModel() != CodeModel::Small)
	    return 0;

	  // Materialize addresses with LEA/MOV instructions.
	  X86AddressMode AM;
	  if (!X86SelectAddress(GV, AM))
	    return 0;

	  // If the expression is just a base register, then we're done; otherwise an
	  // instruction has to be emitted.
	  const bool IsBareBaseReg = AM.BaseType == X86AddressMode::RegBase &&
	                             !AM.IndexReg && !AM.Disp && !AM.GV;
	  if (IsBareBaseReg)
	    return AM.Base.Reg;

	  const unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));

	  if (TM.getRelocationModel() == Reloc::Static &&
	      TLI.getPointerTy(DL) == MVT::i64) {
	    // The displacement code could be more than 32 bits away so we need to use
	    // an instruction with a 64 bit immediate.
	    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
	            ResultReg)
	        .addGlobalAddress(GV);
	    return ResultReg;
	  }

	  // Otherwise form the address with the LEA flavour that matches the pointer
	  // width.
	  unsigned LeaOpc = X86::LEA64r;
	  if (TLI.getPointerTy(DL) == MVT::i32)
	    LeaOpc = Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r;
	  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	                         TII.get(LeaOpc), ResultReg),
	                 AM);
	  return ResultReg;
	}

	unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
	EVT CEVT = TLI.getValueType(DL, C->getType(), true);

	// Only handle simple types.
	if (!CEVT.isSimple())
	return 0;
	MVT VT = CEVT.getSimpleVT();

	if (const auto *CI = dyn_cast<ConstantInt>(C))
	return X86MaterializeInt(CI, VT);
	else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
	return X86MaterializeFP(CFP, VT);
	else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
	return X86MaterializeGV(GV, VT);

	return 0;
	}

	unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
	// Fail on dynamic allocas. At this point, getRegForValue has already
	// checked its CSE maps, so if we're here trying to handle a dynamic
	// alloca, we're not going to succeed. X86SelectAddress has a
	// check for dynamic allocas, because it's called directly from
	// various places, but targetMaterializeAlloca also needs a check
	// in order to avoid recursion between getRegForValue,
	// X86SelectAddrss, and targetMaterializeAlloca.
	if (!FuncInfo.StaticAllocaMap.count(C))
	return 0;
	assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");

	X86AddressMode AM;
	if (!X86SelectAddress(C, AM))
	return 0;
	unsigned Opc =
	TLI.getPointerTy(DL) == MVT::i32
	? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
	: X86::LEA64r;
	const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
	unsigned ResultReg = createResultReg(RC);
	addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(Opc), ResultReg), AM);
	return ResultReg;
	}

	/// Materialize a floating-point zero of the type of \p CF in a register.
	///
	/// \returns the register, or 0 when the type is not legal or is neither f32
	/// nor f64.
	unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
	  MVT VT;
	  if (!isTypeLegal(CF->getType(), VT))
	    return 0;

	  // Get opcode and regclass for the given zero.
	  const bool HasAVX512 = Subtarget->hasAVX512();
	  unsigned Opc = 0;
	  const TargetRegisterClass *RC = nullptr;

	  if (VT == MVT::f32) {
	    if (!X86ScalarSSEf32) {
	      Opc = X86::LD_Fp032;
	      RC = &X86::RFP32RegClass;
	    } else if (HasAVX512) {
	      Opc = X86::AVX512_FsFLD0SS;
	      RC = &X86::FR32XRegClass;
	    } else {
	      Opc = X86::FsFLD0SS;
	      RC = &X86::FR32RegClass;
	    }
	  } else if (VT == MVT::f64) {
	    if (!X86ScalarSSEf64) {
	      Opc = X86::LD_Fp064;
	      RC = &X86::RFP64RegClass;
	    } else if (HasAVX512) {
	      Opc = X86::AVX512_FsFLD0SD;
	      RC = &X86::FR64XRegClass;
	    } else {
	      Opc = X86::FsFLD0SD;
	      RC = &X86::FR64RegClass;
	    }
	  } else {
	    // No f80 (or any other type) support yet.
	    return 0;
	  }

	  const unsigned ResultReg = createResultReg(RC);
	  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
	  return ResultReg;
	}


	/// Try to fold the load \p LI into operand \p OpNo of \p MI.
	///
	/// On success the folded instruction returned by foldMemoryOperandImpl gets a
	/// memory operand describing \p LI and \p MI is erased.
	///
	/// \returns true if the load was folded; false if no address could be selected
	/// for the load's pointer or if folding produced no instruction.
	bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
	const LoadInst *LI) {
	const Value *Ptr = LI->getPointerOperand();
	X86AddressMode AM;
	if (!X86SelectAddress(Ptr, AM))
	return false;

	// View the target-independent instruction info as the X86 one so that
	// foldMemoryOperandImpl can be called on it.
	const X86InstrInfo &XII = (const X86InstrInfo &)TII;

	unsigned Size = DL.getTypeAllocSize(LI->getType());
	unsigned Alignment = LI->getAlignment();

	if (Alignment == 0) // Ensure that codegen never sees alignment 0
	Alignment = DL.getABITypeAlignment(LI->getType());

	// Expand the selected address mode into the operand list that is handed to
	// the folding routine.
	SmallVector<MachineOperand, 8> AddrOps;
	AM.getFullAddress(AddrOps);

	MachineInstr *Result = XII.foldMemoryOperandImpl(
	FuncInfo.MF, MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
	/AllowCommute=/true);
	if (!Result)
	return false;

	// The index register could be in the wrong register class. Unfortunately,
	// foldMemoryOperandImpl could have commuted the instruction, so it's not
	// enough to just look at OpNo + the offset to the index reg. We actually need
	// to scan the instruction to find the index reg and see if it's the correct
	// reg class.
	unsigned OperandNo = 0;
	for (MachineInstr::mop_iterator I = Result->operands_begin(),
	E = Result->operands_end(); I != E; ++I, ++OperandNo) {
	MachineOperand &MO = *I;
	// Only register uses that name the index register are of interest.
	if (!MO.isReg() \|\| MO.isDef() \|\| MO.getReg() != AM.IndexReg)
	continue;
	// Found the index reg, now try to rewrite it.
	unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
	MO.getReg(), OperandNo);
	// Same register back means no rewrite is needed.
	if (IndexReg == MO.getReg())
	continue;
	MO.setReg(IndexReg);
	}

	Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
	MI->eraseFromParent();
	return true;
	}

	unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
	const TargetRegisterClass *RC,
	unsigned Op0, bool Op0IsKill,
	unsigned Op1, bool Op1IsKill,
	unsigned Op2, bool Op2IsKill,
	unsigned Op3, bool Op3IsKill) {
	const MCInstrDesc &II = TII.get(MachineInstOpcode);

	unsigned ResultReg = createResultReg(RC);
	Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
	Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
	Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
	Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);

	if (II.getNumDefs() >= 1)
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
	.addReg(Op0, getKillRegState(Op0IsKill))
	.addReg(Op1, getKillRegState(Op1IsKill))
	.addReg(Op2, getKillRegState(Op2IsKill))
	.addReg(Op3, getKillRegState(Op3IsKill));
	else {
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
	.addReg(Op0, getKillRegState(Op0IsKill))
	.addReg(Op1, getKillRegState(Op1IsKill))
	.addReg(Op2, getKillRegState(Op2IsKill))
	.addReg(Op3, getKillRegState(Op3IsKill));
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
	TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
	}
	return ResultReg;
	}


	namespace llvm {
	/// Factory for the X86 fast instruction selector: returns a newly allocated
	/// X86FastISel constructed from \p funcInfo and \p libInfo.
	FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo) {
	return new X86FastISel(funcInfo, libInfo);
	}
	} // end namespace llvm
	Index: head/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp (revision 328817)
	@@ -1,3052 +1,3061 @@
	//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the X86 implementation of TargetFrameLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "X86FrameLowering.h"
	#include "X86InstrBuilder.h"
	#include "X86InstrInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86Subtarget.h"
	#include "X86TargetMachine.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/Function.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Target/TargetOptions.h"
	#include <cstdlib>

	using namespace llvm;

	/// Build the frame lowering for subtarget \p STI. The local area offset passed
	/// to the base class is -8 on 64-bit subtargets and -4 otherwise.
	X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
	                                   unsigned StackAlignOverride)
	    : TargetFrameLowering(StackGrowsDown, StackAlignOverride,
	                          STI.is64Bit() ? -8 : -4),
	      STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
	  // Cache the frame-related subtarget predicates.
	  Is64Bit = STI.is64Bit();
	  IsLP64 = STI.isTarget64BitLP64();
	  // Standard x86_64 and NaCl use 64-bit frame/stack pointers; x32 uses 32-bit
	  // ones.
	  Uses64BitFramePtr = IsLP64 || STI.isTargetNaCl64();

	  // Cache the values that come from the register info.
	  SlotSize = TRI->getSlotSize();
	  StackPtr = TRI->getStackRegister();
	}

	bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
	return !MF.getFrameInfo().hasVarSizedObjects() &&
	!MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
	}

	/// canSimplifyCallFramePseudos - The call frame pseudos can be simplified when
	/// there is a reserved call frame. Merely having a frame pointer, as the
	/// default implementation requires, is not sufficient here because it cannot
	/// always be used, so the test is: reserved call frame, or a frame pointer
	/// without stack realignment, or a base pointer.
	bool
	X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
	  if (hasReservedCallFrame(MF))
	    return true;

	  // The frame pointer only helps when the stack is not realigned.
	  if (hasFP(MF) && !TRI->needsStackRealignment(MF))
	    return true;

	  return TRI->hasBasePointer(MF);
	}

	// needsFrameIndexResolution - Do we need to perform FI resolution for
	// this function. Normally, this is required only when the function
	// has any stack objects. However, FI resolution actually has another job,
	// not apparent from the title - it resolves callframesetup/destroy
	// that were not simplified earlier.
	// So, this is required for x86 functions that have push sequences even
	// when there are no stack objects.
	bool
	X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
	return MF.getFrameInfo().hasStackObjects() \|\|
	MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
	}

	/// hasFP - Return true if the specified function should have a dedicated frame
	/// pointer register. Any one of the conditions tested below is enough, for
	/// example disabled frame pointer elimination or variable sized allocas.
	bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
	  // Frame pointer elimination is disabled, or the stack must be realigned.
	  if (MF.getTarget().Options.DisableFramePointerElim(MF) ||
	      TRI->needsStackRealignment(MF))
	    return true;

	  // Properties of the frame itself.
	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
	      MFI.hasOpaqueSPAdjustment())
	    return true;

	  // An explicit request recorded in the X86 function info.
	  if (MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer())
	    return true;

	  // Exception handling related properties of the function.
	  if (MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn())
	    return true;

	  // Remaining frame properties.
	  return MFI.hasStackMap() || MFI.hasPatchPoint() ||
	         MFI.hasCopyImplyingStackAdjustment();
	}

	/// Pick the SUB register/immediate opcode: 64-bit forms when \p IsLP64 is
	/// non-zero, 32-bit forms otherwise, and the 8-bit immediate variant whenever
	/// \p Imm fits in a signed 8-bit value.
	static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
	  const bool FitsInByte = isInt<8>(Imm);
	  if (IsLP64)
	    return FitsInByte ? X86::SUB64ri8 : X86::SUB64ri32;
	  return FitsInByte ? X86::SUB32ri8 : X86::SUB32ri;
	}

	/// Pick the ADD register/immediate opcode: 64-bit forms when \p IsLP64 is
	/// non-zero, 32-bit forms otherwise, and the 8-bit immediate variant whenever
	/// \p Imm fits in a signed 8-bit value.
	static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
	  const bool FitsInByte = isInt<8>(Imm);
	  return IsLP64 ? (FitsInByte ? X86::ADD64ri8 : X86::ADD64ri32)
	                : (FitsInByte ? X86::ADD32ri8 : X86::ADD32ri);
	}

	/// Pick the register/register SUB opcode: 64-bit when \p isLP64 is non-zero,
	/// 32-bit otherwise.
	static unsigned getSUBrrOpcode(unsigned isLP64) {
	  if (isLP64)
	    return X86::SUB64rr;
	  return X86::SUB32rr;
	}

	/// Pick the register/register ADD opcode: 64-bit when \p isLP64 is non-zero,
	/// 32-bit otherwise.
	static unsigned getADDrrOpcode(unsigned isLP64) {
	  if (isLP64)
	    return X86::ADD64rr;
	  return X86::ADD32rr;
	}

	/// Pick the AND register/immediate opcode: 64-bit forms when \p IsLP64 is set,
	/// 32-bit forms otherwise, and the 8-bit immediate variant whenever \p Imm
	/// fits in a signed 8-bit value.
	static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
	  const bool FitsInByte = isInt<8>(Imm);
	  if (IsLP64) {
	    if (FitsInByte)
	      return X86::AND64ri8;
	    return X86::AND64ri32;
	  } else {
	    if (FitsInByte)
	      return X86::AND32ri8;
	    return X86::AND32ri;
	  }
	}

	/// Pick the LEA opcode: 64-bit when \p IsLP64 is non-zero, 32-bit otherwise.
	static unsigned getLEArOpcode(unsigned IsLP64) {
	  if (IsLP64)
	    return X86::LEA64r;
	  return X86::LEA32r;
	}

	/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
	/// when it reaches the "return" instruction. We can then pop a stack object
	/// to this register without worrying about clobbering it.
	///
	/// \p MBBI must point at one of the return / tail-call opcodes listed in the
	/// switch below; for any other instruction, for the end of the block, and for
	/// functions that call eh.return, 0 is returned. \p Is64Bit is not read by
	/// this function.
	///
	/// \returns the first candidate register that does not alias any register
	/// used by the instruction at \p MBBI, or 0 if there is none.
	static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator &MBBI,
	const X86RegisterInfo *TRI,
	bool Is64Bit) {
	const MachineFunction *MF = MBB.getParent();
	if (MF->callsEHReturn())
	return 0;

	// Candidate registers come from the tail-call GPR class.
	// NOTE(review): AvailableRegs is a reference and is iterated below, but it is
	// initialised here from what looks like a pointer-returning call; a
	// dereference may have been lost in this rendering of the file - confirm
	// against the upstream source.
	const TargetRegisterClass &AvailableRegs = TRI->getGPRsForTailCall(MF);

	if (MBBI == MBB.end())
	return 0;

	switch (MBBI->getOpcode()) {
	default: return 0;
	case TargetOpcode::PATCHABLE_RET:
	case X86::RET:
	case X86::RETL:
	case X86::RETQ:
	case X86::RETIL:
	case X86::RETIQ:
	case X86::TCRETURNdi:
	case X86::TCRETURNri:
	case X86::TCRETURNmi:
	case X86::TCRETURNdi64:
	case X86::TCRETURNri64:
	case X86::TCRETURNmi64:
	case X86::EH_RETURN:
	case X86::EH_RETURN64: {
	// Collect every register used by the instruction, together with all of its
	// aliases.
	SmallSet<uint16_t, 8> Uses;
	for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
	MachineOperand &MO = MBBI->getOperand(i);
	if (!MO.isReg() \|\| MO.isDef())
	continue;
	unsigned Reg = MO.getReg();
	if (!Reg)
	continue;
	for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
	Uses.insert(*AI);
	}

	// The first candidate that is not used (and is not RIP) is the answer.
	for (auto CS : AvailableRegs)
	if (!Uses.count(CS) && CS != X86::RIP)
	return CS;
	}
	}

	return 0;
	}

	/// Return true if RAX, EAX, AX, AH or AL appears among the live-in registers
	/// of \p MBB.
	static bool isEAXLiveIn(MachineBasicBlock &MBB) {
	  for (const MachineBasicBlock::RegisterMaskPair &LiveIn : MBB.liveins()) {
	    switch (LiveIn.PhysReg) {
	    case X86::RAX:
	    case X86::EAX:
	    case X86::AX:
	    case X86::AH:
	    case X86::AL:
	      return true;
	    default:
	      break;
	    }
	  }

	  return false;
	}

	/// Check if the flags need to be preserved before the terminators.
	/// That is the case when EFLAGS is live-in to the region formed by the
	/// terminators, or live-out of that region without being defined by one of
	/// the terminators.
	static bool
	flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
	  for (const MachineInstr &Term : MBB.terminators()) {
	    bool DefinesFlags = false;
	    for (const MachineOperand &Op : Term.operands()) {
	      if (!Op.isReg() || Op.getReg() != X86::EFLAGS)
	        continue;

	      // This terminator reads EFLAGS and no earlier terminator defined them:
	      // EFLAGS is live-in to the region formed by the terminators.
	      if (!Op.isDef())
	        return true;

	      // This terminator defines EFLAGS, so they need no preserving. Keep
	      // scanning its remaining operands though, in case it also reads the
	      // incoming value.
	      DefinesFlags = true;
	    }

	    // A definition of EFLAGS was found; no need to preserve them.
	    if (DefinesFlags)
	      return false;
	  }

	  // None of the terminators use or define EFLAGS. If they are live into a
	  // successor, they are live-out of this block and must be preserved.
	  for (const MachineBasicBlock *Successor : MBB.successors()) {
	    if (Successor->isLiveIn(X86::EFLAGS))
	      return true;
	  }

	  return false;
	}

	/// emitSPUpdate - Emit a series of instructions to increment / decrement the
	/// stack pointer by a constant value.
	///
	/// \param MBB        Block that receives the instructions.
	/// \param MBBI       Insertion point; new instructions are placed before it.
	/// \param NumBytes   Signed adjustment. A negative value is emitted as a
	///                   subtraction and flagged FrameSetup, any other value as an
	///                   addition flagged FrameDestroy.
	/// \param InEpilogue Forwarded unchanged to BuildStackAdjustment.
	void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator &MBBI,
	int64_t NumBytes, bool InEpilogue) const {
	bool isSub = NumBytes < 0;
	// Work with the magnitude from here on; isSub remembers the direction.
	uint64_t Offset = isSub ? -NumBytes : NumBytes;
	MachineInstr::MIFlag Flag =
	isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;

	// Largest amount handled by a single adjustment: 2^31 - 1.
	uint64_t Chunk = (1LL << 31) - 1;
	DebugLoc DL = MBB.findDebugLoc(MBBI);

	if (Offset > Chunk) {
	// Rather than emit a long series of instructions for large offsets,
	// load the offset into a register and do one sub/add
	unsigned Reg = 0;
	unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);

	// When subtracting, RAX/EAX can be used as scratch as long as it is not
	// live into the block; otherwise look for a dead caller-saved register.
	if (isSub && !isEAXLiveIn(MBB))
	Reg = Rax;
	else
	Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);

	unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
	unsigned AddSubRROpc =
	isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
	if (Reg) {
	// Scratch register available: materialize the offset, then one add/sub.
	BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
	.addImm(Offset)
	.setMIFlag(Flag);
	MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
	.addReg(StackPtr)
	.addReg(Reg);
	MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
	return;
	} else if (Offset > 8 * Chunk) {
	// If we would need more than 8 add or sub instructions (a >16GB stack
	// frame), it's worth spilling RAX to materialize this immediate.
	// pushq %rax
	// movabsq +-$Offset+-SlotSize, %rax
	// addq %rsp, %rax
	// xchg %rax, (%rsp)
	// movq (%rsp), %rsp
	assert(Is64Bit && "can't have 32-bit 16GB stack frame");
	BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
	.addReg(Rax, RegState::Kill)
	.setMIFlag(Flag);
	// Subtract is not commutative, so negate the offset and always use add.
	// Subtract 8 less and add 8 more to account for the PUSH we just did.
	if (isSub)
	Offset = -(Offset - SlotSize);
	else
	Offset = Offset + SlotSize;
	BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
	.addImm(Offset)
	.setMIFlag(Flag);
	MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
	.addReg(Rax)
	.addReg(StackPtr);
	MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
	// Exchange the new SP in RAX with the top of the stack.
	addRegOffset(
	BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
	StackPtr, false, 0);
	// Load new SP from the top of the stack into RSP.
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
	StackPtr, false, 0);
	return;
	}
	// No scratch register and not large enough to justify spilling RAX: fall
	// through to the chunked loop below.
	}

	// Emit the adjustment in pieces of at most Chunk bytes.
	while (Offset) {
	uint64_t ThisVal = std::min(Offset, Chunk);
	if (ThisVal == SlotSize) {
	// Use push / pop for slot sized adjustments as a size optimization. We
	// need to find a dead register when using pop.
	unsigned Reg = isSub
	? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
	: findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
	if (Reg) {
	unsigned Opc = isSub
	? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
	: (Is64Bit ? X86::POP64r : X86::POP32r);
	// A push reads the register as undef (its value does not matter); a pop
	// defines it.
	BuildMI(MBB, MBBI, DL, TII.get(Opc))
	.addReg(Reg, getDefRegState(!isSub) \| getUndefRegState(isSub))
	.setMIFlag(Flag);
	Offset -= ThisVal;
	continue;
	}
	}

	// General case: let BuildStackAdjustment emit this piece.
	BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
	.setMIFlag(Flag);

	Offset -= ThisVal;
	}
	}

	/// Insert, before MBBI, a single instruction that adds Offset to StackPtr and
	/// return its builder. Offset must be nonzero (asserted) and may be negative.
	/// The adjustment is emitted as an LEA when UseLEA is computed as true below,
	/// and as an ADD/SUB with an immediate otherwise; in the latter case the
	/// EFLAGS def (operand 3) is marked dead.
	MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
	MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
	assert(Offset != 0 && "zero offset stack adjustment requested");

	// On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
	// is tricky.
	bool UseLEA;
	if (!InEpilogue) {
	// Check if inserting the prologue at the beginning
	// of MBB would require to use LEA operations.
	// We need to use LEA operations if EFLAGS is live in, because
	// it means an instruction will read it before it gets defined.
	UseLEA = STI.useLeaForSP() \|\| MBB.isLiveIn(X86::EFLAGS);
	} else {
	// If we can use LEA for SP but we shouldn't, check that none
	// of the terminators uses the eflags. Otherwise we will insert
	// a ADD that will redefine the eflags and break the condition.
	// Alternatively, we could move the ADD, but this may not be possible
	// and is an optimization anyway.
	UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
	if (UseLEA && !STI.useLeaForSP())
	UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
	// If that assert breaks, that means we do not do the right thing
	// in canUseAsEpilogue.
	assert((UseLEA \|\| !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
	"We shouldn't have allowed this insertion point");
	}

	MachineInstrBuilder MI;
	if (UseLEA) {
	// LEA form: StackPtr = StackPtr + Offset, with the signed offset used
	// directly as the displacement.
	MI = addRegOffset(BuildMI(MBB, MBBI, DL,
	TII.get(getLEArOpcode(Uses64BitFramePtr)),
	StackPtr),
	StackPtr, false, Offset);
	} else {
	// ADD/SUB form: the immediate is the magnitude and the opcode carries the
	// direction.
	bool IsSub = Offset < 0;
	uint64_t AbsOffset = IsSub ? -Offset : Offset;
	unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
	: getADDriOpcode(Uses64BitFramePtr, AbsOffset);
	MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
	.addReg(StackPtr)
	.addImm(AbsOffset);
	MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
	}
	return MI;
	}

	/// Try to fold a neighbouring stack pointer adjustment into the caller's own
	/// update. The candidate is the instruction before MBBI when
	/// doMergeWithPrevious is true and the instruction at MBBI otherwise. If the
	/// candidate is an ADD/SUB with immediate, or an LEA of the matched form,
	/// whose destination is StackPtr, it is erased and the signed amount it added
	/// to the stack pointer is returned; otherwise 0 is returned and nothing is
	/// changed. When merging forward, MBBI is moved to the following instruction
	/// after the erase.
	int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator &MBBI,
	bool doMergeWithPrevious) const {
	// Nothing to look at if there is no previous (resp. current) instruction.
	if ((doMergeWithPrevious && MBBI == MBB.begin()) \|\|
	(!doMergeWithPrevious && MBBI == MBB.end()))
	return 0;

	// PI is the merge candidate; NI is only meaningful when merging forward and
	// is where MBBI is moved to once PI has been erased.
	MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
	MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
	: std::next(MBBI);
	PI = skipDebugInstructionsBackward(PI, MBB.begin());
	if (NI != nullptr)
	NI = skipDebugInstructionsForward(NI, MBB.end());

	unsigned Opc = PI->getOpcode();
	int Offset = 0;

	if (!doMergeWithPrevious && NI != MBB.end() &&
	NI->getOpcode() == TargetOpcode::CFI_INSTRUCTION) {
	// Don't merge with the next instruction if it has CFI.
	return Offset;
	}

	if ((Opc == X86::ADD64ri32 \|\| Opc == X86::ADD64ri8 \|\|
	Opc == X86::ADD32ri \|\| Opc == X86::ADD32ri8) &&
	PI->getOperand(0).getReg() == StackPtr){
	// ADD imm to StackPtr: contributes +imm.
	assert(PI->getOperand(1).getReg() == StackPtr);
	Offset += PI->getOperand(2).getImm();
	MBB.erase(PI);
	if (!doMergeWithPrevious) MBBI = NI;
	} else if ((Opc == X86::LEA32r \|\| Opc == X86::LEA64_32r) &&
	PI->getOperand(0).getReg() == StackPtr &&
	PI->getOperand(1).getReg() == StackPtr &&
	PI->getOperand(2).getImm() == 1 &&
	PI->getOperand(3).getReg() == X86::NoRegister &&
	PI->getOperand(5).getReg() == X86::NoRegister) {
	// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
	// Only the plain "StackPtr + displacement" form is accepted (operand 2
	// equal to 1, operands 3 and 5 NoRegister); operand 4 is the displacement.
	Offset += PI->getOperand(4).getImm();
	MBB.erase(PI);
	if (!doMergeWithPrevious) MBBI = NI;
	} else if ((Opc == X86::SUB64ri32 \|\| Opc == X86::SUB64ri8 \|\|
	Opc == X86::SUB32ri \|\| Opc == X86::SUB32ri8) &&
	PI->getOperand(0).getReg() == StackPtr) {
	// SUB imm from StackPtr: contributes -imm.
	assert(PI->getOperand(1).getReg() == StackPtr);
	Offset -= PI->getOperand(2).getImm();
	MBB.erase(PI);
	if (!doMergeWithPrevious) MBBI = NI;
	}

	return Offset;
	}

	/// Record CFIInst with the function that owns MBB and insert a
	/// CFI_INSTRUCTION pseudo referring to it before MBBI.
	void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
	                                MachineBasicBlock::iterator MBBI,
	                                const DebugLoc &DL,
	                                const MCCFIInstruction &CFIInst) const {
	  // addFrameInst hands back the index under which the directive was stored;
	  // that index is the only operand of the pseudo instruction.
	  const unsigned FrameInstIdx = MBB.getParent()->addFrameInst(CFIInst);

	  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
	      .addCFIIndex(FrameInstIdx);
	}

	/// Emit, before MBBI, one CFI "offset" directive per callee-saved register
	/// recorded in the frame info, giving the frame offset of the register's
	/// save slot. Does nothing when no callee-saved registers are recorded.
	void X86FrameLowering::emitCalleeSavedFrameMoves(
	    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	    const DebugLoc &DL) const {
	  MachineFunction &Func = *MBB.getParent();
	  MachineFrameInfo &FrameInfo = Func.getFrameInfo();
	  MachineModuleInfo &ModInfo = Func.getMMI();
	  const MCRegisterInfo *RegInfo = ModInfo.getContext().getRegisterInfo();

	  const std::vector<CalleeSavedInfo> &Saved = FrameInfo.getCalleeSavedInfo();
	  if (Saved.empty())
	    return;

	  for (const CalleeSavedInfo &Info : Saved) {
	    // Frame offset of the slot this register was saved to.
	    const int64_t SlotOffset = FrameInfo.getObjectOffset(Info.getFrameIdx());
	    const unsigned SavedReg = Info.getReg();

	    // CFI directives name registers by their DWARF number.
	    const unsigned DwarfReg = RegInfo->getDwarfRegNum(SavedReg, true);
	    BuildCFI(MBB, MBBI, DL,
	             MCCFIInstruction::createOffset(nullptr, DwarfReg, SlotOffset));
	  }
	}

	/// Dispatch to the stack probe expansion appropriate for the function's
	/// subtarget: a call to the probe function for everything except Windows
	/// CoreCLR, and for CoreCLR either the stub call (in the prolog) or the
	/// inline expansion (outside the prolog).
	void X86FrameLowering::emitStackProbe(MachineFunction &MF,
	                                      MachineBasicBlock &MBB,
	                                      MachineBasicBlock::iterator MBBI,
	                                      const DebugLoc &DL, bool InProlog) const {
	  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();

	  if (!Subtarget.isTargetWindowsCoreCLR()) {
	    emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
	    return;
	  }

	  if (InProlog)
	    emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
	  else
	    emitStackProbeInline(MF, MBB, MBBI, DL, false);
	}

	/// Replace the first call to "__chkstk_stub" found in PrologMBB (if any) with
	/// the inline stack probe expansion. The expansion is inserted right after
	/// the stub call, and the stub call is then erased.
	void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
	                                        MachineBasicBlock &PrologMBB) const {
	  const StringRef StubName = "__chkstk_stub";

	  // Find the first call whose callee operand is the stub symbol.
	  MachineInstr *StubCall = nullptr;
	  for (MachineInstr &Inst : PrologMBB) {
	    if (!Inst.isCall())
	      continue;
	    if (!Inst.getOperand(0).isSymbol())
	      continue;
	    if (StubName == Inst.getOperand(0).getSymbolName()) {
	      StubCall = &Inst;
	      break;
	    }
	  }

	  // No stub in this block: nothing to expand.
	  if (StubCall == nullptr)
	    return;

	  assert(!StubCall->isBundled() &&
	         "Not expecting bundled instructions here");
	  MachineBasicBlock::iterator InsertPt = std::next(StubCall->getIterator());
	  assert(std::prev(InsertPt) == StubCall &&
	         "MBBI expected after __chkstk_stub.");
	  DebugLoc Loc = PrologMBB.findDebugLoc(InsertPt);
	  emitStackProbeInline(MF, PrologMBB, InsertPt, Loc, true);
	  StubCall->eraseFromParent();
	}

	/// Expand an inline stack probe sequence at MBBI. The subtarget must be
	/// 64-bit Windows CoreCLR (both are asserted). MBB is split at MBBI: new
	/// blocks RoundMBB, LoopMBB and ContinueMBB are inserted after it and the
	/// tail of MBB (from MBBI on) is moved into ContinueMBB together with MBB's
	/// successors. With InProlog the sequence works directly on RAX, RCX and RDX
	/// (RCX and RDX are stored to, and later reloaded from, RSP-relative slots)
	/// and the inserted instructions are flagged FrameSetup; otherwise virtual
	/// registers are created for every value.
	void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
	MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI,
	const DebugLoc &DL,
	bool InProlog) const {
	const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
	assert(STI.is64Bit() && "different expansion needed for 32 bit");
	assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
	const TargetInstrInfo &TII = *STI.getInstrInfo();
	const BasicBlock *LLVM_BB = MBB.getBasicBlock();

	// RAX contains the number of bytes of desired stack adjustment.
	// The handling here assumes this value has already been updated so as to
	// maintain stack alignment.
	//
	// We need to exit with RSP modified by this amount and execute suitable
	// page touches to notify the OS that we're growing the stack responsibly.
	// All stack probing must be done without modifying RSP.
	//
	// MBB:
	// SizeReg = RAX;
	// ZeroReg = 0
	// CopyReg = RSP
	// Flags, TestReg = CopyReg - SizeReg
	// FinalReg = !Flags.Ovf ? TestReg : ZeroReg
	// LimitReg = gs magic thread env access
	// if FinalReg >= LimitReg goto ContinueMBB
	// RoundMBB:
	// RoundedReg = page address of FinalReg
	// LoopMBB:
	// JoinReg = PHI(LimitReg,ProbeReg)
	// ProbeReg = JoinReg - PageSize
	// [ProbeReg] = 0
	// if (ProbeReg != RoundedReg) goto LoopMBB
	// ContinueMBB:
	// RSP = RSP - RAX
	// [rest of original MBB]

	// Set up the new basic blocks
	MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);

	// Inserting all three before the same position lays them out in the order
	// MBB, RoundMBB, LoopMBB, ContinueMBB.
	MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
	MF.insert(MBBIter, RoundMBB);
	MF.insert(MBBIter, LoopMBB);
	MF.insert(MBBIter, ContinueMBB);

	// Split MBB and move the tail portion down to ContinueMBB.
	// BeforeMBBI remembers the last instruction that stays ahead of the
	// expansion so the new instructions in MBB can be found again later.
	// NOTE(review): std::prev assumes MBBI is not MBB.begin() -- confirm callers.
	MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
	ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
	ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);

	// Some useful constants
	const int64_t ThreadEnvironmentStackLimit = 0x10;
	const int64_t PageSize = 0x1000;
	const int64_t PageMask = ~(PageSize - 1);

	// Registers we need. For the normal case we use virtual
	// registers. For the prolog expansion we use RAX, RCX and RDX.
	// In the prolog case several of the names below deliberately alias the same
	// physical register (RDX for Copy/Test/Final/Rounded, RCX for
	// Zero/Limit/Join/Probe).
	MachineRegisterInfo &MRI = MF.getRegInfo();
	const TargetRegisterClass *RegClass = &X86::GR64RegClass;
	const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
	: MRI.createVirtualRegister(RegClass),
	ZeroReg = InProlog ? (unsigned)X86::RCX
	: MRI.createVirtualRegister(RegClass),
	CopyReg = InProlog ? (unsigned)X86::RDX
	: MRI.createVirtualRegister(RegClass),
	TestReg = InProlog ? (unsigned)X86::RDX
	: MRI.createVirtualRegister(RegClass),
	FinalReg = InProlog ? (unsigned)X86::RDX
	: MRI.createVirtualRegister(RegClass),
	RoundedReg = InProlog ? (unsigned)X86::RDX
	: MRI.createVirtualRegister(RegClass),
	LimitReg = InProlog ? (unsigned)X86::RCX
	: MRI.createVirtualRegister(RegClass),
	JoinReg = InProlog ? (unsigned)X86::RCX
	: MRI.createVirtualRegister(RegClass),
	ProbeReg = InProlog ? (unsigned)X86::RCX
	: MRI.createVirtualRegister(RegClass);

	// SP-relative offsets where we can save RCX and RDX.
	int64_t RCXShadowSlot = 0;
	int64_t RDXShadowSlot = 0;

	// If inlining in the prolog, save RCX and RDX.
	// Future optimization: don't save or restore if not live in.
	if (InProlog) {
	// Compute the offsets. We need to account for things already
	// pushed onto the stack at this point: return address, frame
	// pointer (if used), and callee saves.
	X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
	const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
	const bool HasFP = hasFP(MF);
	RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
	RDXShadowSlot = RCXShadowSlot + 8;
	// Emit the saves.
	addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
	RCXShadowSlot)
	.addReg(X86::RCX);
	addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
	RDXShadowSlot)
	.addReg(X86::RDX);
	} else {
	// Not in the prolog. Copy RAX to a virtual reg.
	BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
	}

	// Add code to MBB to check for overflow and set the new target stack pointer
	// to zero if so.
	BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
	.addReg(ZeroReg, RegState::Undef)
	.addReg(ZeroReg, RegState::Undef);
	BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
	BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
	.addReg(CopyReg)
	.addReg(SizeReg);
	BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
	.addReg(TestReg)
	.addReg(ZeroReg);

	// FinalReg now holds final stack pointer value, or zero if
	// allocation would overflow. Compare against the current stack
	// limit from the thread environment block. Note this limit is the
	// lowest touched page on the stack, not the point at which the OS
	// will cause an overflow exception, so this is just an optimization
	// to avoid unnecessarily touching pages that are below the current
	// SP but already committed to the stack by the OS.
	// The load below has no base or index register: it reads from the GS
	// segment at displacement ThreadEnvironmentStackLimit.
	BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
	.addReg(0)
	.addImm(1)
	.addReg(0)
	.addImm(ThreadEnvironmentStackLimit)
	.addReg(X86::GS);
	BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
	// Jump if the desired stack pointer is at or above the stack limit.
	BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);

	// Add code to roundMBB to round the final stack pointer to a page boundary.
	BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
	.addReg(FinalReg)
	.addImm(PageMask);
	BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);

	// LimitReg now holds the current stack limit, RoundedReg page-rounded
	// final RSP value. Add code to loopMBB to decrement LimitReg page-by-page
	// and probe until we reach RoundedReg.
	// A PHI is only needed with virtual registers; in the prolog case LimitReg,
	// JoinReg and ProbeReg are all RCX.
	if (!InProlog) {
	BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
	.addReg(LimitReg)
	.addMBB(RoundMBB)
	.addReg(ProbeReg)
	.addMBB(LoopMBB);
	}

	// ProbeReg = JoinReg - PageSize.
	addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
	false, -PageSize);

	// Probe by storing a byte onto the stack.
	BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
	.addReg(ProbeReg)
	.addImm(1)
	.addReg(0)
	.addImm(0)
	.addReg(0)
	.addImm(0);
	// Keep looping until the probed address equals the rounded target.
	BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
	.addReg(RoundedReg)
	.addReg(ProbeReg);
	BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);

	// Everything added to ContinueMBB below goes in front of this position,
	// i.e. ahead of the instructions moved over from MBB.
	MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();

	// If in prolog, restore RDX and RCX.
	if (InProlog) {
	addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
	X86::RCX),
	X86::RSP, false, RCXShadowSlot);
	addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
	X86::RDX),
	X86::RSP, false, RDXShadowSlot);
	}

	// Now that the probing is done, add code to continueMBB to update
	// the stack pointer for real.
	BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
	.addReg(X86::RSP)
	.addReg(SizeReg);

	// Add the control flow edges we need.
	MBB.addSuccessor(ContinueMBB);
	MBB.addSuccessor(RoundMBB);
	RoundMBB->addSuccessor(LoopMBB);
	LoopMBB->addSuccessor(ContinueMBB);
	LoopMBB->addSuccessor(LoopMBB);

	// Mark all the instructions added to the prolog as frame setup.
	if (InProlog) {
	// In MBB the new instructions are everything after BeforeMBBI.
	for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
	BeforeMBBI->setFlag(MachineInstr::FrameSetup);
	}
	for (MachineInstr &MI : *RoundMBB) {
	MI.setFlag(MachineInstr::FrameSetup);
	}
	for (MachineInstr &MI : *LoopMBB) {
	MI.setFlag(MachineInstr::FrameSetup);
	}
	// In ContinueMBB only the instructions ahead of ContinueMBBI are flagged.
	for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
	CMBBI != ContinueMBBI; ++CMBBI) {
	CMBBI->setFlag(MachineInstr::FrameSetup);
	}
	}

	// Possible TODO: physreg liveness for InProlog case.
	}

	/// Emit, before MBBI, a call to the stack probe function named by
	/// getStackProbeSymbolName. On 64-bit targets with the large code model the
	/// call goes through R11; that combination together with retpoline is
	/// rejected with a fatal error. For Win64 and non-Windows targets a
	/// "SP -= AX" is emitted after the call. With InProlog every inserted
	/// instruction is flagged FrameSetup.
	void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
	MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI,
	const DebugLoc &DL,
	bool InProlog) const {
	bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;

	+ // FIXME: Add retpoline support and remove this.
	+ if (Is64Bit && IsLargeCodeModel && STI.useRetpoline())
	+ report_fatal_error("Emitting stack probe calls on 64-bit with the large "
	+ "code model and retpoline not yet implemented.");
	+
	// Pick the call opcode: indirect through a register for the 64-bit large
	// code model, PC-relative otherwise.
	unsigned CallOp;
	if (Is64Bit)
	CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
	else
	CallOp = X86::CALLpcrel32;

	StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);

	MachineInstrBuilder CI;
	// Remember the instruction ahead of the expansion so the inserted
	// instructions can be revisited at the end.
	// NOTE(review): std::prev assumes MBBI is not MBB.begin() -- confirm callers.
	MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);

	// All current stack probes take AX and SP as input, clobber flags, and
	// preserve all registers. x86_64 probes leave RSP unmodified.
	// NOTE(review): the condition below repeats the code-model test already
	// held in IsLargeCodeModel.
	if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
	// For the large code model, we have to call through a register. Use R11,
	// as it is scratch in all supported calling conventions.
	BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
	.addExternalSymbol(MF.createExternalSymbolName(Symbol));
	CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
	} else {
	CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
	.addExternalSymbol(MF.createExternalSymbolName(Symbol));
	}

	// Attach the implicit operands to the call: AX and SP are both used and
	// defined, and EFLAGS is defined.
	unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
	unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
	CI.addReg(AX, RegState::Implicit)
	.addReg(SP, RegState::Implicit)
	.addReg(AX, RegState::Define \| RegState::Implicit)
	.addReg(SP, RegState::Define \| RegState::Implicit)
	.addReg(X86::EFLAGS, RegState::Define \| RegState::Implicit);

	if (STI.isTargetWin64() \|\| !STI.isOSWindows()) {
	// MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
	// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
	// themselves. They also do not clobber %rax so we can reuse it when
	// adjusting %rsp.
	// All other platforms do not specify a particular ABI for the stack probe
	// function, so we arbitrarily define it to not adjust %esp/%rsp itself.
	BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP)
	.addReg(SP)
	.addReg(AX);
	}

	if (InProlog) {
	// Apply the frame setup flag to all inserted instrs.
	for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
	ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
	}
	}

	/// Insert, before MBBI, a PC-relative call to the placeholder symbol
	/// "__chkstk_stub". Only valid in the prolog (asserted).
	void X86FrameLowering::emitStackProbeInlineStub(
	    MachineFunction &MF, MachineBasicBlock &MBB,
	    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
	  assert(InProlog && "ChkStkStub called outside prolog!");

	  // Create the call first, then attach the stub symbol as its callee.
	  MachineInstrBuilder StubCall =
	      BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32));
	  StubCall.addExternalSymbol("__chkstk_stub");
	}

	/// Compute the frame-pointer offset to use for the Win64 UWOP_SET_FPREG
	/// unwind opcode: SPAdjust capped at 128 and rounded down to a multiple
	/// of 16.
	static unsigned calculateSetFPREG(uint64_t SPAdjust) {
	  // The Win64 ABI would allow up to 240; 128 works just as well and may
	  // lead to smaller successive adjustments.
	  const uint64_t Win64MaxSEHOffset = 128;
	  const uint64_t Capped =
	      SPAdjust < Win64MaxSEHOffset ? SPAdjust : Win64MaxSEHOffset;

	  // UWOP_SET_FPREG requires a 16-byte aligned offset, so drop the low
	  // four bits.
	  const uint64_t AlignMask = ~static_cast<uint64_t>(15);
	  return static_cast<unsigned>(Capped & AlignMask);
	}

	// If we're forcing a stack realignment we can't rely on just the frame
	// info, we need to know the ABI stack alignment as well in case we
	// have a call out. Otherwise just make sure we have some alignment - we'll
	// go with the minimum SlotSize.
	uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment.
	unsigned StackAlign = getStackAlignment();
	if (MF.getFunction().hasFnAttribute("stackrealign")) {
	if (MFI.hasCalls())
	MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
	else if (MaxAlign < SlotSize)
	MaxAlign = SlotSize;
	}
	return MaxAlign;
	}

	/// Insert, before MBBI, an AND of Reg with the negated MaxAlign, flagged as
	/// FrameSetup, and mark the instruction's EFLAGS def as dead.
	void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
	                                          MachineBasicBlock::iterator MBBI,
	                                          const DebugLoc &DL, unsigned Reg,
	                                          uint64_t MaxAlign) const {
	  // Negating the alignment gives the mask that clears the low bits
	  // (assuming MaxAlign is a power of two).
	  const uint64_t AlignMask = -MaxAlign;

	  MachineInstrBuilder AndMI = BuildMI(
	      MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, AlignMask)),
	      Reg);
	  AndMI.addReg(Reg).addImm(AlignMask).setMIFlag(MachineInstr::FrameSetup);

	  // Operand 3 is the implicit EFLAGS def, which nothing reads.
	  AndMI->getOperand(3).setIsDead();
	}

	/// emitPrologue - Push callee-saved registers onto the stack, which
	/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
	/// space for local variables. Also emit labels used by the exception handler to
	/// generate the exception handling frames.

	/*
	Here's a gist of what gets emitted:

	; Establish frame pointer, if needed
	[if needs FP]
	push %rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	.seh_pushreg %rbp
	mov %rsp, %rbp
	.cfi_def_cfa_register %rbp

	; Spill general-purpose registers
	[for all callee-saved GPRs]
	pushq %<reg>
	[if not needs FP]
	.cfi_def_cfa_offset (offset from RETADDR)
	.seh_pushreg %<reg>

	; If the required stack alignment > default stack alignment
	; rsp needs to be re-aligned. This creates a "re-alignment gap"
	; of unknown size in the stack frame.
	[if stack needs re-alignment]
	and $MASK, %rsp

	; Allocate space for locals
	[if target is Windows and allocated space > 4096 bytes]
	; Windows needs special care for allocations larger
	; than one page.
	mov $NNN, %rax
	call ___chkstk_ms/___chkstk
	sub %rax, %rsp
	[else]
	sub $NNN, %rsp

	[if needs FP]
	.seh_stackalloc (size of XMM spill slots)
	.seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
	[else]
	.seh_stackalloc NNN

	; Spill XMMs
	; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
	; they may get spilled on any platform, if the current function
	; calls @llvm.eh.unwind.init
	[if needs FP]
	[for all callee-saved XMM registers]
	movaps %<xmm reg>, -MMM(%rbp)
	[for all callee-saved XMM registers]
	.seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
	; i.e. the offset relative to (%rbp - SEHFrameOffset)
	[else]
	[for all callee-saved XMM registers]
	movaps %<xmm reg>, KKK(%rsp)
	[for all callee-saved XMM registers]
	.seh_savexmm %<xmm reg>, KKK

	.seh_endprologue

	[if needs base pointer]
	mov %rsp, %rbx
	[if needs to restore base pointer]
	mov %rsp, -MMM(%rbp)

	; Emit CFI info
	[if needs FP]
	[for all callee-saved registers]
	.cfi_offset %<reg>, (offset from %rbp)
	[else]
	.cfi_def_cfa_offset (offset from RETADDR)
	[for all callee-saved registers]
	.cfi_offset %<reg>, (offset from %rsp)

	Notes:
	- .seh directives are emitted only for Windows 64 ABI
	- .cv_fpo directives are emitted on win32 when emitting CodeView
	- .cfi directives are emitted for all other ABIs
	- for 32-bit code, substitute %e?? registers for %r??
	*/

	void X86FrameLowering::emitPrologue(MachineFunction &MF,
	MachineBasicBlock &MBB) const {
	assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
	"MF used frame lowering for wrong subtarget");
	MachineBasicBlock::iterator MBBI = MBB.begin();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	const Function &Fn = MF.getFunction();
	MachineModuleInfo &MMI = MF.getMMI();
	X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
	uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
	uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
	bool IsFunclet = MBB.isEHFuncletEntry();
	EHPersonality Personality = EHPersonality::Unknown;
	if (Fn.hasPersonalityFn())
	Personality = classifyEHPersonality(Fn.getPersonalityFn());
	bool FnHasClrFunclet =
	MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
	bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
	bool HasFP = hasFP(MF);
	bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
	bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
	bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
	// FIXME: Emit FPO data for EH funclets.
	bool NeedsWinFPO =
	!IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag();
	bool NeedsWinCFI = NeedsWin64CFI \|\| NeedsWinFPO;
	bool NeedsDwarfCFI =
	!IsWin64Prologue && (MMI.hasDebugInfo() \|\| Fn.needsUnwindTableEntry());
	unsigned FramePtr = TRI->getFrameRegister(MF);
	const unsigned MachineFramePtr =
	STI.isTarget64BitILP32()
	? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
	unsigned BasePtr = TRI->getBaseRegister();
	bool HasWinCFI = false;

	// Debug location must be unknown since the first debug location is used
	// to determine the end of the prologue.
	DebugLoc DL;

	// Add RETADDR move area to callee saved frame size.
	int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
	if (TailCallReturnAddrDelta && IsWin64Prologue)
	report_fatal_error("Can't handle guaranteed tail call under win64 yet");

	if (TailCallReturnAddrDelta < 0)
	X86FI->setCalleeSavedFrameSize(
	X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);

	bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();

	// The default stack probe size is 4096 if the function has no
	// "stack-probe-size" attribute.
	unsigned StackProbeSize = 4096;
	if (Fn.hasFnAttribute("stack-probe-size"))
	Fn.getFnAttribute("stack-probe-size")
	.getValueAsString()
	.getAsInteger(0, StackProbeSize);

	// Re-align the stack on 64-bit if the x86-interrupt calling convention is
	// used and an error code was pushed, since the x86-64 ABI requires a 16-byte
	// stack alignment.
	if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
	Fn.arg_size() == 2) {
	StackSize += 8;
	MFI.setStackSize(StackSize);
	emitSPUpdate(MBB, MBBI, -8, /InEpilogue=/false);
	}

	// If this is x86-64 and the Red Zone is not disabled, if we are a leaf
	// function, and use up to 128 bytes of stack space, don't have a frame
	// pointer, calls, or dynamic alloca then we do not need to adjust the
	// stack pointer (we fit in the Red Zone). We also check that we don't
	// push and pop from the stack.
	if (Is64Bit && !Fn.hasFnAttribute(Attribute::NoRedZone) &&
	!TRI->needsStackRealignment(MF) &&
	!MFI.hasVarSizedObjects() && // No dynamic alloca.
	!MFI.adjustsStack() && // No calls.
	!UseStackProbe && // No stack probes.
	!IsWin64CC && // Win64 has no Red Zone
	!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
	!MF.shouldSplitStack()) { // Regular stack
	uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
	if (HasFP) MinSize += SlotSize;
	X86FI->setUsesRedZone(MinSize > 0 \|\| StackSize > 0);
	StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
	MFI.setStackSize(StackSize);
	}

	// Insert stack pointer adjustment for later moving of return addr. Only
	// applies to tail call optimized functions where the callee argument stack
	// size is bigger than the callers.
	if (TailCallReturnAddrDelta < 0) {
	BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
	/InEpilogue=/false)
	.setMIFlag(MachineInstr::FrameSetup);
	}

	// Mapping for machine moves:
	//
	// DST: VirtualFP AND
	// SRC: VirtualFP => DW_CFA_def_cfa_offset
	// ELSE => DW_CFA_def_cfa
	//
	// SRC: VirtualFP AND
	// DST: Register => DW_CFA_def_cfa_register
	//
	// ELSE
	// OFFSET < 0 => DW_CFA_offset_extended_sf
	// REG < 64 => DW_CFA_offset + Reg
	// ELSE => DW_CFA_offset_extended

	uint64_t NumBytes = 0;
	int stackGrowth = -SlotSize;

	// Find the funclet establisher parameter
	unsigned Establisher = X86::NoRegister;
	if (IsClrFunclet)
	Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
	else if (IsFunclet)
	Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;

	if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
	// Immediately spill establisher into the home slot.
	// The runtime cares about this.
	// MOV64mr %rdx, 16(%rsp)
	unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
	.addReg(Establisher)
	.setMIFlag(MachineInstr::FrameSetup);
	MBB.addLiveIn(Establisher);
	}

	if (HasFP) {
	assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");

	// Calculate required stack adjustment.
	uint64_t FrameSize = StackSize - SlotSize;
	// If required, include space for extra hidden slot for stashing base pointer.
	if (X86FI->getRestoreBasePointer())
	FrameSize += SlotSize;

	NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();

	// Callee-saved registers are pushed on stack before the stack is realigned.
	if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
	NumBytes = alignTo(NumBytes, MaxAlign);

	// Get the offset of the stack slot for the EBP register, which is
	// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
	// Update the frame offset adjustment.
	if (!IsFunclet)
	MFI.setOffsetAdjustment(-NumBytes);
	else
	assert(MFI.getOffsetAdjustment() == -(int)NumBytes &&
	"should calculate same local variable offset for funclets");

	// Save EBP/RBP into the appropriate stack slot.
	BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
	.addReg(MachineFramePtr, RegState::Kill)
	.setMIFlag(MachineInstr::FrameSetup);

	if (NeedsDwarfCFI) {
	// Mark the place where EBP/RBP was saved.
	// Define the current CFA rule to use the provided offset.
	assert(StackSize);
	BuildCFI(MBB, MBBI, DL,
	MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));

	// Change the rule for the FramePtr to be an "offset" rule.
	unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
	BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
	nullptr, DwarfFramePtr, 2 * stackGrowth));
	}

	if (NeedsWinCFI) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
	.addImm(FramePtr)
	.setMIFlag(MachineInstr::FrameSetup);
	}

	if (!IsWin64Prologue && !IsFunclet) {
	// Update EBP with the new base value.
	BuildMI(MBB, MBBI, DL,
	TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
	FramePtr)
	.addReg(StackPtr)
	.setMIFlag(MachineInstr::FrameSetup);

	if (NeedsDwarfCFI) {
	// Mark effective beginning of when frame pointer becomes valid.
	// Define the current CFA to use the EBP/RBP register.
	unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
	BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
	nullptr, DwarfFramePtr));
	}

	if (NeedsWinFPO) {
	// .cv_fpo_setframe $FramePtr
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
	.addImm(FramePtr)
	.addImm(0)
	.setMIFlag(MachineInstr::FrameSetup);
	}
	}
	} else {
	assert(!IsFunclet && "funclets without FPs not yet implemented");
	NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
	}

	// For EH funclets, only allocate enough space for outgoing calls. Save the
	// NumBytes value that we would've used for the parent frame.
	unsigned ParentFrameNumBytes = NumBytes;
	if (IsFunclet)
	NumBytes = getWinEHFuncletFrameSize(MF);

	// Skip the callee-saved push instructions.
	bool PushedRegs = false;
	int StackOffset = 2 * stackGrowth;

	while (MBBI != MBB.end() &&
	MBBI->getFlag(MachineInstr::FrameSetup) &&
	(MBBI->getOpcode() == X86::PUSH32r \|\|
	MBBI->getOpcode() == X86::PUSH64r)) {
	PushedRegs = true;
	unsigned Reg = MBBI->getOperand(0).getReg();
	++MBBI;

	if (!HasFP && NeedsDwarfCFI) {
	// Mark callee-saved push instruction.
	// Define the current CFA rule to use the provided offset.
	assert(StackSize);
	BuildCFI(MBB, MBBI, DL,
	MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
	StackOffset += stackGrowth;
	}

	if (NeedsWinCFI) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
	.addImm(Reg)
	.setMIFlag(MachineInstr::FrameSetup);
	}
	}

	// Realign stack after we pushed callee-saved registers (so that we'll be
	// able to calculate their offsets from the frame pointer).
	// Don't do this for Win64, it needs to realign the stack after the prologue.
	if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) {
	assert(HasFP && "There should be a frame pointer if stack is realigned.");
	BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
	}

	// If there is an SUB32ri of ESP immediately before this instruction, merge
	// the two. This can be the case when tail call elimination is enabled and
	// the callee has more arguments than the caller.
	NumBytes -= mergeSPUpdates(MBB, MBBI, true);

	// Adjust stack pointer: ESP -= numbytes.

	// Windows and cygwin/mingw require a prologue helper routine when allocating
	// more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
	// uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
	// stack and adjust the stack pointer in one go. The 64-bit version of
	// __chkstk is only responsible for probing the stack. The 64-bit prologue is
	// responsible for adjusting the stack pointer. Touching the stack at 4K
	// increments is necessary to ensure that the guard pages used by the OS
	// virtual memory manager are allocated in correct sequence.
	uint64_t AlignedNumBytes = NumBytes;
	if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
	AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
	if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
	assert(!X86FI->getUsesRedZone() &&
	"The Red Zone is not accounted for in stack probes");

	// Check whether EAX is livein for this block.
	bool isEAXAlive = isEAXLiveIn(MBB);

	if (isEAXAlive) {
	// Sanity check that EAX is not livein for this function.
	// It should not be, so throw an assert.
	assert(!Is64Bit && "EAX is livein in x64 case!");

	// Save EAX
	BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
	.addReg(X86::EAX, RegState::Kill)
	.setMIFlag(MachineInstr::FrameSetup);
	}

	if (Is64Bit) {
	// Handle the 64-bit Windows ABI case where we need to call __chkstk.
	// Function prologue is responsible for adjusting the stack pointer.
	if (isUInt<32>(NumBytes)) {
	BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
	.addImm(NumBytes)
	.setMIFlag(MachineInstr::FrameSetup);
	} else if (isInt<32>(NumBytes)) {
	BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
	.addImm(NumBytes)
	.setMIFlag(MachineInstr::FrameSetup);
	} else {
	BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
	.addImm(NumBytes)
	.setMIFlag(MachineInstr::FrameSetup);
	}
	} else {
	// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
	// We'll also use 4 already allocated bytes for EAX.
	BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
	.addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
	.setMIFlag(MachineInstr::FrameSetup);
	}

	// Call __chkstk, __chkstk_ms, or __alloca.
	emitStackProbe(MF, MBB, MBBI, DL, true);

	if (isEAXAlive) {
	// Restore EAX
	MachineInstr *MI =
	addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
	StackPtr, false, NumBytes - 4);
	MI->setFlag(MachineInstr::FrameSetup);
	MBB.insert(MBBI, MI);
	}
	} else if (NumBytes) {
	emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /InEpilogue=/false);
	}

	if (NeedsWinCFI && NumBytes) {
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
	.addImm(NumBytes)
	.setMIFlag(MachineInstr::FrameSetup);
	}

	int SEHFrameOffset = 0;
	unsigned SPOrEstablisher;
	if (IsFunclet) {
	if (IsClrFunclet) {
	// The establisher parameter passed to a CLR funclet is actually a pointer
	// to the (mostly empty) frame of its nearest enclosing funclet; we have
	// to find the root function establisher frame by loading the PSPSym from
	// the intermediate frame.
	unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
	MachinePointerInfo NoInfo;
	MBB.addLiveIn(Establisher);
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
	Establisher, false, PSPSlotOffset)
	.addMemOperand(MF.getMachineMemOperand(
	NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize));
	;
	// Save the root establisher back into the current funclet's (mostly
	// empty) frame, in case a sub-funclet or the GC needs it.
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
	false, PSPSlotOffset)
	.addReg(Establisher)
	.addMemOperand(
	MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore \|
	MachineMemOperand::MOVolatile,
	SlotSize, SlotSize));
	}
	SPOrEstablisher = Establisher;
	} else {
	SPOrEstablisher = StackPtr;
	}

	if (IsWin64Prologue && HasFP) {
	// Set RBP to a small fixed offset from RSP. In the funclet case, we base
	// this calculation on the incoming establisher, which holds the value of
	// RSP from the parent frame at the end of the prologue.
	SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
	if (SEHFrameOffset)
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
	SPOrEstablisher, false, SEHFrameOffset);
	else
	BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
	.addReg(SPOrEstablisher);

	// If this is not a funclet, emit the CFI describing our frame pointer.
	if (NeedsWinCFI && !IsFunclet) {
	assert(!NeedsWinFPO && "this setframe incompatible with FPO data");
	HasWinCFI = true;
	BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
	.addImm(FramePtr)
	.addImm(SEHFrameOffset)
	.setMIFlag(MachineInstr::FrameSetup);
	if (isAsynchronousEHPersonality(Personality))
	MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
	}
	} else if (IsFunclet && STI.is32Bit()) {
	// Reset EBP / ESI to something good for funclets.
	MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
	// If we're a catch funclet, we can be returned to via catchret. Save ESP
	// into the registration node so that the runtime will restore it for us.
	if (!MBB.isCleanupFuncletEntry()) {
	assert(Personality == EHPersonality::MSVC_CXX);
	unsigned FrameReg;
	int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
	int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
	// ESP is the first field, so no extra displacement is needed.
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
	false, EHRegOffset)
	.addReg(X86::ESP);
	}
	}

	while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
	const MachineInstr &FrameInstr = *MBBI;
	++MBBI;

	if (NeedsWinCFI) {
	int FI;
	if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
	if (X86::FR64RegClass.contains(Reg)) {
	unsigned IgnoredFrameReg;
	int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
	Offset += SEHFrameOffset;

	HasWinCFI = true;
	assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
	BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
	.addImm(Reg)
	.addImm(Offset)
	.setMIFlag(MachineInstr::FrameSetup);
	}
	}
	}
	}

	if (NeedsWinCFI && HasWinCFI)
	BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
	.setMIFlag(MachineInstr::FrameSetup);

	if (FnHasClrFunclet && !IsFunclet) {
	// Save the so-called Initial-SP (i.e. the value of the stack pointer
	// immediately after the prolog) into the PSPSlot so that funclets
	// and the GC can recover it.
	unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
	auto PSPInfo = MachinePointerInfo::getFixedStack(
	MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,
	PSPSlotOffset)
	.addReg(StackPtr)
	.addMemOperand(MF.getMachineMemOperand(
	PSPInfo, MachineMemOperand::MOStore \| MachineMemOperand::MOVolatile,
	SlotSize, SlotSize));
	}

	// Realign stack after we spilled callee-saved registers (so that we'll be
	// able to calculate their offsets from the frame pointer).
	// Win64 requires aligning the stack after the prologue.
	if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
	assert(HasFP && "There should be a frame pointer if stack is realigned.");
	BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
	}

	// We already dealt with stack realignment and funclets above.
	if (IsFunclet && STI.is32Bit())
	return;

	// If we need a base pointer, set it up here. It's whatever the value
	// of the stack pointer is at this point. Any variable size objects
	// will be allocated after this, so we can still use the base pointer
	// to reference locals.
	if (TRI->hasBasePointer(MF)) {
	// Update the base pointer with the current stack pointer.
	unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
	BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
	.addReg(SPOrEstablisher)
	.setMIFlag(MachineInstr::FrameSetup);
	if (X86FI->getRestoreBasePointer()) {
	// Stash value of base pointer. Saving RSP instead of EBP shortens
	// dependence chain. Used by SjLj EH.
	unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
	FramePtr, true, X86FI->getRestoreBasePointerOffset())
	.addReg(SPOrEstablisher)
	.setMIFlag(MachineInstr::FrameSetup);
	}

	if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
	// Stash the value of the frame pointer relative to the base pointer for
	// Win32 EH. This supports Win32 EH, which does the inverse of the above:
	// it recovers the frame pointer from the base pointer rather than the
	// other way around.
	unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
	unsigned UsedReg;
	int Offset =
	getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
	assert(UsedReg == BasePtr);
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
	.addReg(FramePtr)
	.setMIFlag(MachineInstr::FrameSetup);
	}
	}

	if (((!HasFP && NumBytes) \|\| PushedRegs) && NeedsDwarfCFI) {
	// Mark end of stack pointer adjustment.
	if (!HasFP && NumBytes) {
	// Define the current CFA rule to use the provided offset.
	assert(StackSize);
	BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
	nullptr, -StackSize + stackGrowth));
	}

	// Emit DWARF info specifying the offsets of the callee-saved registers.
	emitCalleeSavedFrameMoves(MBB, MBBI, DL);
	}

	// X86 Interrupt handling function cannot assume anything about the direction
	// flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction
	// in each prologue of interrupt handler function.
	//
	// FIXME: Create "cld" instruction only in these cases:
	// 1. The interrupt handling function uses any of the "rep" instructions.
	// 2. Interrupt handling function calls another function.
	//
	if (Fn.getCallingConv() == CallingConv::X86_INTR)
	BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
	.setMIFlag(MachineInstr::FrameSetup);

	// At this point we know if the function has WinCFI or not.
	MF.setHasWinCFI(HasWinCFI);
	}

	/// Returns true when the epilogue is allowed to adjust the stack pointer with
	/// an LEA instruction.
	///
	/// In the Win64 ABI, a function without a frame pointer may only deallocate
	/// its stack with ADD instructions. LEA is therefore usable when either:
	///   1. the target does not use Windows CFI, or
	///   2. the function has a frame pointer.
	bool X86FrameLowering::canUseLEAForSPInEpilogue(
	    const MachineFunction &MF) const {
	  // Case 1: no Windows CFI, so there is no restriction on the epilogue form.
	  if (!MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
	    return true;
	  // Case 2: Windows CFI is in use; LEA is only permitted with a frame pointer.
	  return hasFP(MF);
	}

	/// Returns true if MI is a funclet return, i.e. its opcode is either
	/// X86::CATCHRET or X86::CLEANUPRET.
	static bool isFuncletReturnInstr(MachineInstr &MI) {
	  const unsigned Opcode = MI.getOpcode();
	  return Opcode == X86::CATCHRET || Opcode == X86::CLEANUPRET;
	}

	// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
	// stack. It holds a pointer to the bottom of the root function frame. The
	// establisher frame pointer passed to a nested funclet may point to the
	// (mostly empty) frame of its parent funclet, but it will need to find
	// the frame of the root function to access locals. To facilitate this,
	// every funclet copies the pointer to the bottom of the root function
	// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
	// same offset for the PSPSym in the root function frame that's used in the
	// funclets' frames allows each funclet to dynamically accept any ancestor
	// frame as its establisher argument (the runtime doesn't guarantee the
	// immediate parent for some reason lost to history), and also allows the GC,
	// which uses the PSPSym for some bookkeeping, to find it in any funclet's
	// frame with only a single offset reported for the entire method.
	//
	// Returns the offset of the PSPSym slot (frame index
	// WinEHFuncInfo::PSPSymFrameIdx) from the stack pointer. SP updates inside
	// the function body are ignored when computing it. The offset is asserted to
	// be non-negative and relative to the stack register.
	unsigned
	X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
	  const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
	  unsigned SPReg;
	  int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
	                                              /*IgnoreSPUpdates*/ true);
	  assert(Offset >= 0 && SPReg == TRI->getStackRegister());
	  return static_cast<unsigned>(Offset);
	}

	unsigned
	X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
	// This is the size of the pushed CSRs.
	unsigned CSSize =
	MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
	// This is the amount of stack a funclet needs to allocate.
	unsigned UsedSize;
	EHPersonality Personality =
	classifyEHPersonality(MF.getFunction().getPersonalityFn());
	if (Personality == EHPersonality::CoreCLR) {
	// CLR funclets need to hold enough space to include the PSPSym, at the
	// same offset from the stack pointer (immediately after the prolog) as it
	// resides at in the main function.
	UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
	} else {
	// Other funclets just need enough stack for outgoing call arguments.
	UsedSize = MF.getFrameInfo().getMaxCallFrameSize();
	}
	// RBP is not included in the callee saved register block. After pushing RBP,
	// everything is 16 byte aligned. Everything we allocate before an outgoing
	// call must also be 16 byte aligned.
	unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
	// Subtract out the size of the callee saved registers. This is how much stack
	// each funclet will allocate.
	return FrameSizeMinusRBP - CSSize;
	}

	/// Returns true if Opc is one of the TCRETURN tail-call return pseudo opcodes
	/// (register, direct or memory form, 32- or 64-bit).
	static bool isTailCallOpcode(unsigned Opc) {
	  switch (Opc) {
	  case X86::TCRETURNri:
	  case X86::TCRETURNdi:
	  case X86::TCRETURNmi:
	  case X86::TCRETURNri64:
	  case X86::TCRETURNdi64:
	  case X86::TCRETURNmi64:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Emits the function (or funclet) epilogue into MBB, in front of the block's
	/// first terminator.
	///
	/// In order, this:
	///  - computes the number of bytes (NumBytes) the stack pointer has to be
	///    moved back by, depending on funclet / frame-pointer / realignment state;
	///  - pops the frame pointer when the function has one;
	///  - steps back over the POP32r/POP64r instructions flagged FrameDestroy so
	///    the stack-pointer restore is inserted ahead of them;
	///  - for CATCHRET funclets, materializes the return target;
	///  - restores the stack pointer, either from the frame pointer (LEA/MOV) when
	///    the stack is realigned or has variable-sized objects, or by adding
	///    NumBytes otherwise;
	///  - emits an SEH_Epilogue marker when Win64 CFI is in use;
	///  - if the terminator is not a tail call, gives back the tail-call return
	///    address delta.
	void X86FrameLowering::emitEpilogue(MachineFunction &MF,
	                                    MachineBasicBlock &MBB) const {
	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
	  MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator();
	  MachineBasicBlock::iterator MBBI = Terminator;
	  DebugLoc DL;
	  if (MBBI != MBB.end())
	    DL = MBBI->getDebugLoc();
	  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
	  const bool Is64BitILP32 = STI.isTarget64BitILP32();
	  unsigned FramePtr = TRI->getFrameRegister(MF);
	  unsigned MachineFramePtr =
	      Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;

	  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
	  bool NeedsWin64CFI =
	      IsWin64Prologue && MF.getFunction().needsUnwindTableEntry();
	  // A block without a terminator cannot end in a funclet return.
	  bool IsFunclet = MBBI != MBB.end() && isFuncletReturnInstr(*MBBI);

	  // Get the number of bytes to allocate from the FrameInfo.
	  uint64_t StackSize = MFI.getStackSize();
	  uint64_t MaxAlign = calculateMaxStackAlign(MF);
	  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
	  bool HasFP = hasFP(MF);
	  uint64_t NumBytes = 0;

	  if (IsFunclet) {
	    assert(HasFP && "EH funclets without FP not yet implemented");
	    NumBytes = getWinEHFuncletFrameSize(MF);
	  } else if (HasFP) {
	    // Calculate required stack adjustment.
	    uint64_t FrameSize = StackSize - SlotSize;
	    NumBytes = FrameSize - CSSize;

	    // Callee-saved registers were pushed on stack before the stack was
	    // realigned.
	    if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
	      NumBytes = alignTo(FrameSize, MaxAlign);
	  } else {
	    NumBytes = StackSize - CSSize;
	  }
	  // Remember the allocation size before any SP-update merging below changes
	  // NumBytes; the Win64 LEA amount is derived from this value.
	  uint64_t SEHStackAllocAmt = NumBytes;

	  if (HasFP) {
	    // Pop EBP.
	    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
	            MachineFramePtr)
	        .setMIFlag(MachineInstr::FrameDestroy);
	  }

	  MachineBasicBlock::iterator FirstCSPop = MBBI;
	  // Skip the callee-saved pop instructions.
	  while (MBBI != MBB.begin()) {
	    MachineBasicBlock::iterator PI = std::prev(MBBI);
	    unsigned Opc = PI->getOpcode();

	    // DBG_VALUEs and terminators are stepped over without ending the scan.
	    if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
	      // Only POPs carrying the FrameDestroy flag belong to the epilogue.
	      const bool IsCSPop = (Opc == X86::POP32r || Opc == X86::POP64r) &&
	                           PI->getFlag(MachineInstr::FrameDestroy);
	      if (!IsCSPop)
	        break;
	      FirstCSPop = PI;
	    }

	    --MBBI;
	  }
	  MBBI = FirstCSPop;

	  if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET)
	    emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator);

	  if (MBBI != MBB.end())
	    DL = MBBI->getDebugLoc();

	  // If there is an ADD32ri or SUB32ri of ESP immediately before this
	  // instruction, merge the two instructions.
	  if (NumBytes || MFI.hasVarSizedObjects())
	    NumBytes += mergeSPUpdates(MBB, MBBI, true);

	  // If dynamic alloca is used, then reset esp to point to the last callee-saved
	  // slot before popping them off! Same applies for the case, when stack was
	  // realigned. Don't do this if this was a funclet epilogue, since the funclets
	  // will not do realignment or dynamic stack allocation.
	  if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) &&
	      !IsFunclet) {
	    if (TRI->needsStackRealignment(MF))
	      MBBI = FirstCSPop;
	    unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
	    uint64_t LEAAmount =
	        IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;

	    // There are only two legal forms of epilogue:
	    // - add SEHAllocationSize, %rsp
	    // - lea SEHAllocationSize(%FramePtr), %rsp
	    //
	    // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
	    // However, we may use this sequence if we have a frame pointer because the
	    // effects of the prologue can safely be undone.
	    if (LEAAmount != 0) {
	      unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
	      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
	                   FramePtr, false, LEAAmount);
	      --MBBI;
	    } else {
	      unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
	      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
	          .addReg(FramePtr);
	      --MBBI;
	    }
	  } else if (NumBytes) {
	    // Adjust stack pointer back: ESP += numbytes.
	    emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
	    --MBBI;
	  }

	  // Windows unwinder will not invoke function's exception handler if IP is
	  // either in prologue or in epilogue. This behavior causes a problem when a
	  // call immediately precedes an epilogue, because the return address points
	  // into the epilogue. To cope with that, we insert an epilogue marker here,
	  // then replace it with a 'nop' if it ends up immediately after a CALL in the
	  // final emitted code.
	  if (NeedsWin64CFI && MF.hasWinCFI())
	    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));

	  if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
	    // Add the return addr area delta back since we are not tail calling.
	    int Offset = -1 * X86FI->getTCReturnAddrDelta();
	    assert(Offset >= 0 && "TCDelta should never be positive");
	    if (Offset) {
	      // Check for possible merge with preceding ADD instruction.
	      Offset += mergeSPUpdates(MBB, Terminator, true);
	      emitSPUpdate(MBB, Terminator, Offset, /*InEpilogue=*/true);
	    }
	  }
	}

	/// Computes how to address the stack object with frame index FI.
	///
	/// On return, FrameReg holds the register the result is relative to (frame
	/// pointer, base pointer or stack pointer, chosen from the function's base
	/// pointer / realignment state and whether FI is a fixed object), and the
	/// return value is the byte offset from that register.
	int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
	                                             unsigned &FrameReg) const {
	  const MachineFrameInfo &MFI = MF.getFrameInfo();

	  bool IsFixed = MFI.isFixedObjectIndex(FI);
	  // We can't calculate offset from frame pointer if the stack is realigned,
	  // so enforce usage of stack/base pointer. The base pointer is used when we
	  // have dynamic allocas in addition to dynamic realignment.
	  if (TRI->hasBasePointer(MF))
	    FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();
	  else if (TRI->needsStackRealignment(MF))
	    FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getStackRegister();
	  else
	    FrameReg = TRI->getFrameRegister(MF);

	  // Offset will hold the offset from the stack pointer at function entry to the
	  // object.
	  // We need to factor in additional offsets applied during the prologue to the
	  // frame, base, and stack pointer depending on which is used.
	  int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
	  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
	  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
	  uint64_t StackSize = MFI.getStackSize();
	  bool HasFP = hasFP(MF);
	  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
	  int64_t FPDelta = 0;

	  if (IsWin64Prologue) {
	    assert(!MFI.hasCalls() || (StackSize % 16) == 8);

	    // Calculate required stack adjustment.
	    uint64_t FrameSize = StackSize - SlotSize;
	    // If required, include space for extra hidden slot for stashing base
	    // pointer.
	    if (X86FI->getRestoreBasePointer())
	      FrameSize += SlotSize;
	    uint64_t NumBytes = FrameSize - CSSize;

	    uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
	    // The frame index recorded by getFAIndex() is answered directly with the
	    // negated SEH frame offset.
	    if (FI && FI == X86FI->getFAIndex())
	      return -SEHFrameOffset;

	    // FPDelta is the offset from the "traditional" FP location of the old base
	    // pointer followed by return address and the location required by the
	    // restricted Win64 prologue.
	    // Add FPDelta to all offsets below that go through the frame pointer.
	    FPDelta = FrameSize - SEHFrameOffset;
	    assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&
	           "FPDelta isn't aligned per the Win64 ABI!");
	  }


	  if (TRI->hasBasePointer(MF)) {
	    assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
	    if (FI < 0) {
	      // Skip the saved EBP.
	      return Offset + SlotSize + FPDelta;
	    } else {
	      assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
	      return Offset + StackSize;
	    }
	  } else if (TRI->needsStackRealignment(MF)) {
	    if (FI < 0) {
	      // Skip the saved EBP.
	      return Offset + SlotSize + FPDelta;
	    } else {
	      assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
	      return Offset + StackSize;
	    }
	    // FIXME: Support tail calls
	  } else {
	    // No frame pointer: the object is addressed relative to the stack size.
	    if (!HasFP)
	      return Offset + StackSize;

	    // Skip the saved EBP.
	    Offset += SlotSize;

	    // Skip the RETADDR move area
	    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
	    if (TailCallReturnAddrDelta < 0)
	      Offset -= TailCallReturnAddrDelta;
	  }

	  return Offset + FPDelta;
	}

	int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
	int FI, unsigned &FrameReg,
	int Adjustment) const {
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	FrameReg = TRI->getStackRegister();
	return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment;
	}

	int
	X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
	int FI, unsigned &FrameReg,
	bool IgnoreSPUpdates) const {

	const MachineFrameInfo &MFI = MF.getFrameInfo();
	// Does not include any dynamic realign.
	const uint64_t StackSize = MFI.getStackSize();
	// LLVM arranges the stack as follows:
	// ...
	// ARG2
	// ARG1
	// RETADDR
	// PUSH RBP <-- RBP points here
	// PUSH CSRs
	// ~~~~~~~ <-- possible stack realignment (non-win64)
	// ...
	// STACK OBJECTS
	// ... <-- RSP after prologue points here
	// ~~~~~~~ <-- possible stack realignment (win64)
	//
	// if (hasVarSizedObjects()):
	// ... <-- "base pointer" (ESI/RBX) points here
	// DYNAMIC ALLOCAS
	// ... <-- RSP points here
	//
	// Case 1: In the simple case of no stack realignment and no dynamic
	// allocas, both "fixed" stack objects (arguments and CSRs) are addressable
	// with fixed offsets from RSP.
	//
	// Case 2: In the case of stack realignment with no dynamic allocas, fixed
	// stack objects are addressed with RBP and regular stack objects with RSP.
	//
	// Case 3: In the case of dynamic allocas and stack realignment, RSP is used
	// to address stack arguments for outgoing calls and nothing else. The "base
	// pointer" points to local variables, and RBP points to fixed objects.
	//
	// In cases 2 and 3, we can only answer for non-fixed stack objects, and the
	// answer we give is relative to the SP after the prologue, and not the
	// SP in the middle of the function.

	if (MFI.isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) &&
	!STI.isTargetWin64())
	return getFrameIndexReference(MF, FI, FrameReg);

	// If !hasReservedCallFrame the function might have SP adjustement in the
	// body. So, even though the offset is statically known, it depends on where
	// we are in the function.
	const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
	if (!IgnoreSPUpdates && !TFI->hasReservedCallFrame(MF))
	return getFrameIndexReference(MF, FI, FrameReg);

	// We don't handle tail calls, and shouldn't be seeing them either.
	assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
	"we don't handle this case!");

	// This is how the math works out:
	//
	// %rsp grows (i.e. gets lower) left to right. Each box below is
	// one word (eight bytes). Obj0 is the stack slot we're trying to
	// get to.
	//
	// ----------------------------------
	// \| BP \| Obj0 \| Obj1 \| ... \| ObjN \|
	// ----------------------------------
	// ^ ^ ^ ^
	// A B C E
	//
	// A is the incoming stack pointer.
	// (B - A) is the local area offset (-8 for x86-64) [1]
	// (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
	//
	// \|(E - B)\| is the StackSize (absolute value, positive). For a
	// stack that grown down, this works out to be (B - E). [3]
	//
	// E is also the value of %rsp after stack has been set up, and we
	// want (C - E) -- the value we can add to %rsp to get to Obj0. Now
	// (C - E) == (C - A) - (B - A) + (B - E)
	// { Using [1], [2] and [3] above }
	// == getObjectOffset - LocalAreaOffset + StackSize

	return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize);
	}

	/// Assigns fixed spill slots to the callee-saved registers listed in CSI.
	///
	/// When the function has a frame pointer, a slot is reserved for it first and
	/// the frame register is removed from CSI, since the prologue and epilogue
	/// save and restore it themselves. GR64/GR32 registers then receive
	/// SlotSize-sized slots (their total is recorded as the callee-saved frame
	/// size), followed by all remaining registers, which receive slots sized and
	/// aligned for their register class. Always returns true.
	bool X86FrameLowering::assignCalleeSavedSpillSlots(
	    MachineFunction &MF, const TargetRegisterInfo *TRI,
	    std::vector<CalleeSavedInfo> &CSI) const {
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

	  unsigned CalleeSavedFrameSize = 0;
	  int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();

	  if (hasFP(MF)) {
	    // emitPrologue always spills frame register the first thing.
	    SpillSlotOffset -= SlotSize;
	    MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);

	    // Since emitPrologue and emitEpilogue will handle spilling and restoring of
	    // the frame register, we can delete it from CSI list and not have to worry
	    // about avoiding it later.
	    unsigned FPReg = TRI->getFrameRegister(MF);
	    for (unsigned i = 0; i < CSI.size(); ++i) {
	      if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) {
	        CSI.erase(CSI.begin() + i);
	        break;
	      }
	    }
	  }

	  // Assign slots for GPRs. It increases frame size.
	  for (unsigned i = CSI.size(); i != 0; --i) {
	    unsigned Reg = CSI[i - 1].getReg();

	    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
	      continue;

	    SpillSlotOffset -= SlotSize;
	    CalleeSavedFrameSize += SlotSize;

	    int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
	    CSI[i - 1].setFrameIdx(SlotIndex);
	  }

	  X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);

	  // Assign slots for XMMs.
	  for (unsigned i = CSI.size(); i != 0; --i) {
	    unsigned Reg = CSI[i - 1].getReg();
	    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
	      continue;

	    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
	    unsigned Size = TRI->getSpillSize(*RC);
	    unsigned Align = TRI->getSpillAlignment(*RC);
	    // ensure alignment
	    SpillSlotOffset -= std::abs(SpillSlotOffset) % Align;
	    // spill into slot
	    SpillSlotOffset -= Size;
	    int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
	    CSI[i - 1].setFrameIdx(SlotIndex);
	    MFI.ensureMaxAlignment(Align);
	  }

	  return true;
	}

	/// Inserts the callee-saved register spills in front of MI.
	///
	/// GR64/GR32 registers are saved with PUSH instructions; every other register
	/// in CSI is stored to its assigned stack slot. All emitted instructions are
	/// flagged FrameSetup. Nothing is emitted for 32-bit Windows EH funclet entry
	/// blocks. Always returns true.
	bool X86FrameLowering::spillCalleeSavedRegisters(
	    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
	    const std::vector<CalleeSavedInfo> &CSI,
	    const TargetRegisterInfo *TRI) const {
	  DebugLoc DL = MBB.findDebugLoc(MI);

	  // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
	  // for us, and there are no XMM CSRs on Win32.
	  if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())
	    return true;

	  // Push GPRs. It increases frame size.
	  const MachineFunction &MF = *MBB.getParent();
	  // The register info does not change between iterations; look it up once.
	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
	  for (unsigned i = CSI.size(); i != 0; --i) {
	    unsigned Reg = CSI[i - 1].getReg();

	    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
	      continue;

	    bool isLiveIn = MRI.isLiveIn(Reg);
	    if (!isLiveIn)
	      MBB.addLiveIn(Reg);

	    // Decide whether we can add a kill flag to the use.
	    bool CanKill = !isLiveIn;
	    // Check if any subregister is live-in
	    if (CanKill) {
	      for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
	        if (MRI.isLiveIn(*AReg)) {
	          CanKill = false;
	          break;
	        }
	      }
	    }

	    // Do not set a kill flag on values that are also marked as live-in. This
	    // happens with the @llvm-returnaddress intrinsic and with arguments
	    // passed in callee saved registers.
	    // Omitting the kill flags is conservatively correct even if the live-in
	    // is not used after all.
	    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, getKillRegState(CanKill))
	        .setMIFlag(MachineInstr::FrameSetup);
	  }

	  // Make XMM regs spilled. X86 does not have ability of push/pop XMM.
	  // It can be done by spilling XMMs to stack frame.
	  for (unsigned i = CSI.size(); i != 0; --i) {
	    unsigned Reg = CSI[i - 1].getReg();
	    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
	      continue;
	    // Add the callee-saved register as live-in. It's killed at the spill.
	    MBB.addLiveIn(Reg);
	    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);

	    TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
	                            TRI);
	    // Step back onto the instruction in front of MI to flag it as frame
	    // setup, then restore the insertion point.
	    --MI;
	    MI->setFlag(MachineInstr::FrameSetup);
	    ++MI;
	  }

	  return true;
	}

	/// Loads the address of the CATCHRET target block into EAX/RAX, inserting the
	/// instruction before MBBI, and marks the target block as address-taken.
	void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,
	                                               MachineBasicBlock::iterator MBBI,
	                                               MachineInstr *CatchRet) const {
	  // Asynchronous (SEH) personalities must never reach this point.
	  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
	             MBB.getParent()->getFunction().getPersonalityFn())) &&
	         "SEH should not use CATCHRET");
	  DebugLoc Loc = CatchRet->getDebugLoc();
	  MachineBasicBlock *TargetBlock = CatchRet->getOperand(0).getMBB();

	  // Put the target block's address in the return register.
	  if (!STI.is64Bit()) {
	    // MOV32ri $TargetBlock, %eax
	    BuildMI(MBB, MBBI, Loc, TII.get(X86::MOV32ri), X86::EAX)
	        .addMBB(TargetBlock);
	  } else {
	    // LEA64r TargetBlock(%rip), %rax
	    BuildMI(MBB, MBBI, Loc, TII.get(X86::LEA64r), X86::RAX)
	        .addReg(X86::RIP)
	        .addImm(0)
	        .addReg(0)
	        .addMBB(TargetBlock)
	        .addReg(0);
	  }

	  // The block is now referenced by address, not only from a terminator.
	  TargetBlock->setHasAddressTaken();
	}

	/// Inserts the callee-saved register restores in front of MI.
	///
	/// Registers outside GR64/GR32 are reloaded from their stack slots first, then
	/// the GR64/GR32 registers are restored with POP instructions flagged
	/// FrameDestroy. Returns false when CSI is empty and true otherwise, including
	/// the Windows funclet cases in which nothing is emitted.
	bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
	                                                   MachineBasicBlock::iterator MI,
	                                                   std::vector<CalleeSavedInfo> &CSI,
	                                                   const TargetRegisterInfo *TRI) const {
	  if (CSI.empty())
	    return false;

	  if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
	    // Don't restore CSRs in 32-bit EH funclets. Matches
	    // spillCalleeSavedRegisters.
	    if (STI.is32Bit())
	      return true;
	    // Don't restore CSRs before an SEH catchret. SEH except blocks do not form
	    // funclets. emitEpilogue transforms these to normal jumps.
	    if (MI->getOpcode() == X86::CATCHRET) {
	      const Function &F = MBB.getParent()->getFunction();
	      bool IsSEH = isAsynchronousEHPersonality(
	          classifyEHPersonality(F.getPersonalityFn()));
	      if (IsSEH)
	        return true;
	    }
	  }

	  DebugLoc DL = MBB.findDebugLoc(MI);

	  // Reload XMMs from stack frame.
	  for (const CalleeSavedInfo &Info : CSI) {
	    unsigned Reg = Info.getReg();
	    if (X86::GR64RegClass.contains(Reg) ||
	        X86::GR32RegClass.contains(Reg))
	      continue;

	    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
	    TII.loadRegFromStackSlot(MBB, MI, Reg, Info.getFrameIdx(), RC, TRI);
	  }

	  // POP GPRs.
	  unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
	  for (const CalleeSavedInfo &Info : CSI) {
	    unsigned Reg = Info.getReg();
	    if (!X86::GR64RegClass.contains(Reg) &&
	        !X86::GR32RegClass.contains(Reg))
	      continue;

	    BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
	        .setMIFlag(MachineInstr::FrameDestroy);
	  }
	  return true;
	}

	/// Extends the generic callee-save computation with X86 specifics: creates the
	/// fixed object for the tail-call return address area, marks the base pointer
	/// as saved when it is in use, and reserves a frame-pointer spill slot for
	/// functions that combine a base pointer with EH funclets.
	void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
	                                            BitVector &SavedRegs,
	                                            RegScavenger *RS) const {
	  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);

	  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  int64_t RetAddrDelta = FuncInfo->getTCReturnAddrDelta();
	  if (RetAddrDelta < 0) {
	    // Create the RETURNADDR area:
	    //   arg
	    //   arg
	    //   RETADDR
	    //   { ...
	    //     RETADDR area
	    //     ...
	    //   }
	    //   [EBP]
	    FrameInfo.CreateFixedObject(-RetAddrDelta, RetAddrDelta - SlotSize, true);
	  }

	  // Everything below only matters when a base pointer is in use.
	  if (!TRI->hasBasePointer(MF))
	    return;

	  // The base pointer register has to be spilled.
	  SavedRegs.set(TRI->getBaseRegister());

	  // With a base pointer and EH funclets, EBP needs its own spill slot.
	  if (!MF.hasEHFunclets())
	    return;

	  int SaveSlot = FrameInfo.CreateSpillStackObject(SlotSize, SlotSize);
	  FuncInfo->setHasSEHFramePtrSave(true);
	  FuncInfo->setSEHFramePtrSaveIndex(SaveSlot);
	}

	static bool
	HasNestArgument(const MachineFunction *MF) {
	const Function &F = MF->getFunction();
	for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
	I != E; I++) {
	if (I->hasNestAttr())
	return true;
	}
	return false;
	}

	/// GetScratchRegister - Get a temp register for performing work in the
	/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
	/// and the properties of the function either one or two registers will be
	/// needed. Set primary to true for the first register, false for the second.
	static unsigned
	GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
	CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();

	// Erlang stuff.
	if (CallingConvention == CallingConv::HiPE) {
	if (Is64Bit)
	return Primary ? X86::R14 : X86::R13;
	else
	return Primary ? X86::EBX : X86::EDI;
	}

	if (Is64Bit) {
	if (IsLP64)
	return Primary ? X86::R11 : X86::R12;
	else
	return Primary ? X86::R11D : X86::R12D;
	}

	bool IsNested = HasNestArgument(&MF);

	if (CallingConvention == CallingConv::X86_FastCall \|\|
	CallingConvention == CallingConv::Fast) {
	if (IsNested)
	report_fatal_error("Segmented stacks does not support fastcall with "
	"nested function.");
	return Primary ? X86::EAX : X86::ECX;
	}
	if (IsNested)
	return Primary ? X86::EDX : X86::EAX;
	return Primary ? X86::ECX : X86::EAX;
	}

	// Number of bytes by which the stack limit stored in the TCB sits above the
	// actual stack limit.
	static constexpr uint64_t kSplitStackAvailable = 256;

	/// Insert the segmented-stack check in front of the prologue of \p MF.
	///
	/// Two new blocks are pushed to the front of the function: checkMBB compares
	/// the stack pointer (or the stack pointer minus the frame size, for frames of
	/// kSplitStackAvailable bytes or more) against a limit read through a TLS
	/// segment register and branches to \p PrologueMBB with JA_1; allocMBB hands
	/// the frame size and the argument stack size to __morestack, calls it, and
	/// ends with a MORESTACK_RET pseudo. Nothing is emitted when the stack size
	/// is zero.
	///
	/// Calls report_fatal_error for vararg functions and for platforms without a
	/// known TLS slot. \p PrologueMBB must be the entry block (asserted below).
	void X86FrameLowering::adjustForSegmentedStacks(
	MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
	MachineFrameInfo &MFI = MF.getFrameInfo();
	uint64_t StackSize;
	unsigned TlsReg, TlsOffset;
	DebugLoc DL;

	// To support shrink-wrapping we would need to insert the new blocks
	// at the right place and update the branches to PrologueMBB.
	assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");

	// The primary scratch register holds SP - StackSize for the comparison.
	unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
	assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
	"Scratch register is live-in");

	if (MF.getFunction().isVarArg())
	report_fatal_error("Segmented stacks do not support vararg functions.");
	if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
	!STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
	!STI.isTargetDragonFly())
	report_fatal_error("Segmented stacks not supported on this platform.");

	// Eventually StackSize will be calculated by a link-time pass; which will
	// also decide whether checking code needs to be injected into this particular
	// prologue.
	StackSize = MFI.getStackSize();

	// Do not generate a prologue for functions with a stack of size zero
	if (StackSize == 0)
	return;

	MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
	MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
	X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
	bool IsNested = false;

	// We need to know if the function has a nest argument only in 64 bit mode.
	if (Is64Bit)
	IsNested = HasNestArgument(&MF);

	// The MOV R10, RAX needs to be in a different block, since the RET we emit in
	// allocMBB needs to be last (terminating) instruction.

	// Both new blocks inherit the live-ins of the original prologue block.
	for (const auto &LI : PrologueMBB.liveins()) {
	allocMBB->addLiveIn(LI);
	checkMBB->addLiveIn(LI);
	}

	if (IsNested)
	allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);

	// Resulting layout: checkMBB, allocMBB, then the original blocks.
	MF.push_front(allocMBB);
	MF.push_front(checkMBB);

	// When the frame size is less than 256 we just compare the stack
	// boundary directly to the value of the stack pointer, per gcc.
	bool CompareStackPointer = StackSize < kSplitStackAvailable;

	// Read the limit off the current stacklet off the stack_guard location.
	if (Is64Bit) {
	if (STI.isTargetLinux()) {
	TlsReg = X86::FS;
	TlsOffset = IsLP64 ? 0x70 : 0x40;
	} else if (STI.isTargetDarwin()) {
	TlsReg = X86::GS;
	TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
	} else if (STI.isTargetWin64()) {
	TlsReg = X86::GS;
	TlsOffset = 0x28; // pvArbitrary, reserved for application use
	} else if (STI.isTargetFreeBSD()) {
	TlsReg = X86::FS;
	TlsOffset = 0x18;
	} else if (STI.isTargetDragonFly()) {
	TlsReg = X86::FS;
	TlsOffset = 0x20; // use tls_tcb.tcb_segstack
	} else {
	report_fatal_error("Segmented stacks not supported on this platform.");
	}

	// Small frames compare SP itself; otherwise ScratchReg = SP - StackSize.
	if (CompareStackPointer)
	ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
	else
	BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
	.addImm(1).addReg(0).addImm(-StackSize).addReg(0);

	BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
	.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
	} else {
	if (STI.isTargetLinux()) {
	TlsReg = X86::GS;
	TlsOffset = 0x30;
	} else if (STI.isTargetDarwin()) {
	TlsReg = X86::GS;
	TlsOffset = 0x48 + 90*4;
	} else if (STI.isTargetWin32()) {
	TlsReg = X86::FS;
	TlsOffset = 0x14; // pvArbitrary, reserved for application use
	} else if (STI.isTargetDragonFly()) {
	TlsReg = X86::FS;
	TlsOffset = 0x10; // use tls_tcb.tcb_segstack
	} else if (STI.isTargetFreeBSD()) {
	report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
	} else {
	report_fatal_error("Segmented stacks not supported on this platform.");
	}

	if (CompareStackPointer)
	ScratchReg = X86::ESP;
	else
	BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
	.addImm(1).addReg(0).addImm(-StackSize).addReg(0);

	if (STI.isTargetLinux() \|\| STI.isTargetWin32() \|\| STI.isTargetWin64() \|\|
	STI.isTargetDragonFly()) {
	// NOTE(review): the immediate after the first addReg(0) is 0 here but 1 in
	// the 64-bit CMP above -- confirm the difference is intended.
	BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
	.addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
	} else if (STI.isTargetDarwin()) {

	// TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
	unsigned ScratchReg2;
	bool SaveScratch2;
	if (CompareStackPointer) {
	// The primary scratch register is available for holding the TLS offset.
	ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
	SaveScratch2 = false;
	} else {
	// Need to use a second register to hold the TLS offset
	ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);

	// Unfortunately, with fastcc the second scratch register may hold an
	// argument.
	SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
	}

	// If Scratch2 is live-in then it needs to be saved.
	assert((!MF.getRegInfo().isLiveIn(ScratchReg2) \|\| SaveScratch2) &&
	"Scratch register is live-in and not saved");

	// Preserve the live-in value around the compare with a PUSH/POP pair.
	if (SaveScratch2)
	BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
	.addReg(ScratchReg2, RegState::Kill);

	BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
	.addImm(TlsOffset);
	BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
	.addReg(ScratchReg)
	.addReg(ScratchReg2).addImm(1).addReg(0)
	.addImm(0)
	.addReg(TlsReg);

	if (SaveScratch2)
	BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
	}
	}

	// This jump is taken if SP >= (Stacklet Limit + Stack Space required).
	// It jumps to normal execution of the function body.
	BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB);

	// On 32 bit we first push the arguments size and then the frame size. On 64
	// bit, we pass the stack frame size in r10 and the argument size in r11.
	if (Is64Bit) {
	// Functions with nested arguments use R10, so it needs to be saved across
	// the call to _morestack

	const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
	const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
	const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
	const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
	const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;

	// Stash the incoming R10 in RAX/EAX before R10 is overwritten below.
	if (IsNested)
	BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);

	BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
	.addImm(StackSize);
	BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
	.addImm(X86FI->getArgumentStackSize());
	} else {
	BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
	.addImm(X86FI->getArgumentStackSize());
	BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
	.addImm(StackSize);
	}

	// __morestack is in libgcc
	if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
	// Under the large code model, we cannot assume that __morestack lives
	// within 2^31 bytes of the call site, so we cannot use pc-relative
	// addressing. We cannot perform the call via a temporary register,
	// as the rax register may be used to store the static chain, and all
	// other suitable registers may be either callee-save or used for
	// parameter passing. We cannot use the stack at this point either
	// because __morestack manipulates the stack directly.
	//
	// To avoid these issues, perform an indirect call via a read-only memory
	// location containing the address.
	//
	// This solution is not perfect, as it assumes that the .rodata section
	// is laid out within 2^31 bytes of each function body, but this seems
	// to be sufficient for JIT.
	// NOTE(review): the four lines below still carry a leading "+" marker,
	// presumably left over from the diff view this text was taken from.
	+ // FIXME: Add retpoline support and remove the error here..
	+ if (STI.useRetpoline())
	+ report_fatal_error("Emitting morestack calls on 64-bit with the large "
	+ "code model and retpoline not yet implemented.");
	BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
	.addReg(X86::RIP)
	.addImm(0)
	.addReg(0)
	.addExternalSymbol("__morestack_addr")
	.addReg(0);
	MF.getMMI().setUsesMorestackAddr(true);
	} else {
	if (Is64Bit)
	BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
	.addExternalSymbol("__morestack");
	else
	BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
	.addExternalSymbol("__morestack");
	}

	// Terminate allocMBB with the return pseudo; the nested variant is the one
	// named for restoring R10.
	if (IsNested)
	BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
	else
	BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));

	// Record the CFG edges of the two new blocks.
	allocMBB->addSuccessor(&PrologueMBB);

	checkMBB->addSuccessor(allocMBB);
	checkMBB->addSuccessor(&PrologueMBB);

	#ifdef EXPENSIVE_CHECKS
	MF.verify();
	#endif
	}

	/// Lookup an ERTS parameter in the !hipe.literals named metadata node.
	/// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets
	/// to fields it needs, through a named metadata node "hipe.literals" containing
	/// name-value pairs.
	///
	/// \param HiPELiteralsMD the "hipe.literals" node; must not be null.
	/// \param LiteralName    name of the literal to look up.
	/// \returns the zero-extended value of the first two-operand entry whose name
	/// matches \p LiteralName and whose value is a ConstantInt. Calls
	/// report_fatal_error when no such entry exists.
	static unsigned getHiPELiteral(
	    NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) {
	  // getNumOperands() is unsigned, so keep the index unsigned as well.
	  for (unsigned Idx = 0, NumOps = HiPELiteralsMD->getNumOperands();
	       Idx != NumOps; ++Idx) {
	    MDNode *Entry = HiPELiteralsMD->getOperand(Idx);
	    // Only well-formed (name, value) pairs are considered.
	    if (Entry->getNumOperands() != 2)
	      continue;

	    MDString *EntryName = dyn_cast<MDString>(Entry->getOperand(0));
	    ValueAsMetadata *EntryVal =
	        dyn_cast<ValueAsMetadata>(Entry->getOperand(1));
	    if (!EntryName || !EntryVal)
	      continue;

	    ConstantInt *IntVal = dyn_cast_or_null<ConstantInt>(EntryVal->getValue());
	    if (IntVal && EntryName->getString() == LiteralName)
	      return IntVal->getZExtValue();
	  }

	  report_fatal_error("HiPE literal " + LiteralName
	                     + " required but not provided");
	}

	/// Erlang programs may need a special prologue to handle the stack size they
	/// might need at runtime. That is because Erlang/OTP does not implement a C
	/// stack but uses a custom implementation of hybrid stack/heap architecture.
	/// (for more information see Eric Stenman's Ph.D. thesis:
	/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
	///
	/// CheckStack:
	/// temp0 = sp - MaxStack
	/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
	/// OldStart:
	/// ...
	/// IncStack:
	/// call inc_stack # doubles the stack space
	/// temp0 = sp - MaxStack
	/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
	///
	/// \param MF          function being lowered; its module must carry the
	///                    "hipe.literals" named metadata (fatal error otherwise).
	/// \param PrologueMBB the entry block of \p MF (asserted below).
	///
	/// The two check blocks are only created when the computed MaxStack exceeds
	/// the guaranteed size (the leaf-words literal times SlotSize).
	void X86FrameLowering::adjustForHiPEPrologue(
	MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
	MachineFrameInfo &MFI = MF.getFrameInfo();
	DebugLoc DL;

	// To support shrink-wrapping we would need to insert the new blocks
	// at the right place and update the branches to PrologueMBB.
	assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");

	// HiPE-specific values
	NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule()
	->getNamedMetadata("hipe.literals");
	if (!HiPELiteralsMD)
	report_fatal_error(
	"Can't generate HiPE prologue without runtime parameters");
	const unsigned HipeLeafWords
	= getHiPELiteral(HiPELiteralsMD,
	Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
	// Arguments beyond the first CCRegisteredArgs are counted as stack slots.
	const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
	const unsigned Guaranteed = HipeLeafWords * SlotSize;
	unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs ?
	MF.getFunction().arg_size() - CCRegisteredArgs : 0;
	// Frame size, plus the caller's stack arguments, plus one extra slot.
	unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize;

	assert(STI.isTargetLinux() &&
	"HiPE prologue is only supported on Linux operating systems.");

	// Compute the largest caller's frame that is needed to fit the callees'
	// frames. This 'MaxStack' is computed from:
	//
	// a) the fixed frame size, which is the space needed for all spilled temps,
	// b) outgoing on-stack parameter areas, and
	// c) the minimum stack space this function needs to make available for the
	// functions it calls (a tunable ABI property).
	if (MFI.hasCalls()) {
	unsigned MoreStackForCalls = 0;

	for (auto &MBB : MF) {
	for (auto &MI : MBB) {
	if (!MI.isCall())
	continue;

	// Get callee operand.
	const MachineOperand &MO = MI.getOperand(0);

	// Only take account of global function calls (no closures etc.).
	if (!MO.isGlobal())
	continue;

	const Function *F = dyn_cast<Function>(MO.getGlobal());
	if (!F)
	continue;

	// Do not update 'MaxStack' for primitive and built-in functions
	// (encoded with names either starting with "erlang."/"bif_" or not
	// having a ".", such as a simple <Module>.<Function>.<Arity>, or an
	// "_", such as the BIF "suspend_0") as they are executed on another
	// stack.
	// NOTE(review): find() matches "erlang." / "bif_" anywhere in the name, not
	// only at the start as the comment above says -- confirm intended.
	if (F->getName().find("erlang.") != StringRef::npos \|\|
	F->getName().find("bif_") != StringRef::npos \|\|
	F->getName().find_first_of("._") == StringRef::npos)
	continue;

	unsigned CalleeStkArity =
	F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
	// Keep the largest extra requirement over all counted callees.
	if (HipeLeafWords - 1 > CalleeStkArity)
	MoreStackForCalls = std::max(MoreStackForCalls,
	(HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
	}
	}
	MaxStack += MoreStackForCalls;
	}

	// If the stack frame needed is larger than the guaranteed then runtime checks
	// and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
	if (MaxStack > Guaranteed) {
	MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
	MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();

	// Both new blocks inherit the live-ins of the original prologue block.
	for (const auto &LI : PrologueMBB.liveins()) {
	stackCheckMBB->addLiveIn(LI);
	incStackMBB->addLiveIn(LI);
	}

	// Resulting layout: stackCheckMBB, incStackMBB, then the original blocks.
	MF.push_front(incStackMBB);
	MF.push_front(stackCheckMBB);

	unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
	unsigned LEAop, CMPop, CALLop;
	SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
	if (Is64Bit) {
	SPReg = X86::RSP;
	PReg = X86::RBP;
	LEAop = X86::LEA64r;
	CMPop = X86::CMP64rm;
	CALLop = X86::CALL64pcrel32;
	} else {
	SPReg = X86::ESP;
	PReg = X86::EBP;
	LEAop = X86::LEA32r;
	CMPop = X86::CMP32rm;
	CALLop = X86::CALLpcrel32;
	}

	ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
	assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
	"HiPE prologue scratch register is live-in");

	// Create new MBB for StackCheck:
	addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
	SPReg, false, -MaxStack);
	// SPLimitOffset is in a fixed heap location (pointed by BP).
	addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
	.addReg(ScratchReg), PReg, false, SPLimitOffset);
	BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB);

	// Create new MBB for IncStack:
	BuildMI(incStackMBB, DL, TII.get(CALLop)).
	addExternalSymbol("inc_stack_0");
	addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
	SPReg, false, -MaxStack);
	addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
	.addReg(ScratchReg), PReg, false, SPLimitOffset);
	// NOTE(review): the first check branches with JAE_1 while this loop uses
	// JLE_1 -- confirm the two conditions are meant to differ.
	BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);

	// CFG edges with branch probabilities: 99/100 towards the prologue, 1/100
	// towards incStackMBB.
	stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
	stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
	incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
	incStackMBB->addSuccessor(incStackMBB, {1, 100});
	}
	#ifdef EXPENSIVE_CHECKS
	MF.verify();
	#endif
	}

	/// Try to implement a stack adjustment of \p Offset bytes with one or two POP
	/// instructions inserted before \p MBBI.
	///
	/// The transformation is only attempted when \p Offset is a positive multiple
	/// of SlotSize worth one or two slots, and the instruction right before
	/// \p MBBI is a call whose operand 1 is a register mask. The POP destinations
	/// are registers the call clobbers but does not define.
	///
	/// \returns true if the POPs were emitted, false if nothing was changed.
	bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
	                                           MachineBasicBlock::iterator MBBI,
	                                           const DebugLoc &DL,
	                                           int Offset) const {
	  if (Offset <= 0)
	    return false;

	  if (Offset % SlotSize)
	    return false;

	  int NumPops = Offset / SlotSize;
	  // This is only worth it if we have at most 2 pops.
	  if (NumPops != 1 && NumPops != 2)
	    return false;

	  // Handle only the trivial case where the adjustment directly follows
	  // a call. This is the most common one, anyway.
	  if (MBBI == MBB.begin())
	    return false;
	  MachineBasicBlock::iterator Prev = std::prev(MBBI);
	  if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
	    return false;

	  unsigned Regs[2];
	  unsigned FoundRegs = 0;

	  auto &MRI = MBB.getParent()->getRegInfo();
	  // Bind by reference: the operand is only queried, so no copy is needed.
	  const MachineOperand &RegMask = Prev->getOperand(1);

	  auto &RegClass =
	      Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
	  // Try to find up to NumPops free registers.
	  for (auto Candidate : RegClass) {
	    // Poor man's liveness:
	    // Since we're immediately after a call, any register that is clobbered
	    // by the call and not defined by it can be considered dead.
	    if (!RegMask.clobbersPhysReg(Candidate))
	      continue;

	    // Don't clobber reserved registers
	    if (MRI.isReserved(Candidate))
	      continue;

	    // Skip candidates that overlap a register the call implicitly defines.
	    bool IsDef = false;
	    for (const MachineOperand &MO : Prev->implicit_operands()) {
	      if (MO.isReg() && MO.isDef() &&
	          TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
	        IsDef = true;
	        break;
	      }
	    }

	    if (IsDef)
	      continue;

	    Regs[FoundRegs++] = Candidate;
	    if (FoundRegs == (unsigned)NumPops)
	      break;
	  }

	  if (FoundRegs == 0)
	    return false;

	  // If we found only one free register, but need two, reuse the same one twice.
	  while (FoundRegs < (unsigned)NumPops)
	    Regs[FoundRegs++] = Regs[0];

	  // The opcode does not depend on the loop index, so pick it once.
	  const unsigned PopOpc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
	  for (int i = 0; i < NumPops; ++i)
	    BuildMI(MBB, MBBI, DL, TII.get(PopOpc), Regs[i]);

	  return true;
	}

	/// Erase the call-frame setup/destroy pseudo instruction at \p I and emit the
	/// real stack-pointer adjustment (plus CFI where required) in its place.
	///
	/// When the call frame is not reserved, the pseudo becomes an SP adjustment of
	/// the aligned frame size, merged with neighbouring SP updates, and possibly
	/// lowered to POPs when optimizing for minimum size. When the call frame is
	/// reserved, only a callee-pop amount on a frame destroy is compensated for,
	/// right after the call.
	///
	/// \returns the iterator following the erased pseudo instruction.
	MachineBasicBlock::iterator X86FrameLowering::
	eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
	                              MachineBasicBlock::iterator I) const {
	  bool reserveCallFrame = hasReservedCallFrame(MF);
	  unsigned Opcode = I->getOpcode();
	  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
	  DebugLoc DL = I->getDebugLoc();
	  uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0;
	  uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
	  I = MBB.erase(I);
	  auto InsertPos = skipDebugInstructionsForward(I, MBB.end());

	  if (!reserveCallFrame) {
	    // If the stack pointer can be changed after prologue, turn the
	    // adjcallstackup instruction into a 'sub ESP, <amt>' and the
	    // adjcallstackdown instruction into 'add ESP, <amt>'

	    // We need to keep the stack aligned properly. To do this, we round the
	    // amount of space needed for the outgoing arguments up to the next
	    // alignment boundary.
	    unsigned StackAlign = getStackAlignment();
	    Amount = alignTo(Amount, StackAlign);

	    MachineModuleInfo &MMI = MF.getMMI();
	    const Function &F = MF.getFunction();
	    bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
	    bool DwarfCFI = !WindowsCFI &&
	                    (MMI.hasDebugInfo() || F.needsUnwindTableEntry());

	    // If we have any exception handlers in this function, and we adjust
	    // the SP before calls, we may need to indicate this to the unwinder
	    // using GNU_ARGS_SIZE. Note that this may be necessary even when
	    // Amount == 0, because the preceding function may have set a non-0
	    // GNU_ARGS_SIZE.
	    // TODO: We don't need to reset this between subsequent functions,
	    // if it didn't change.
	    bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();

	    if (HasDwarfEHHandlers && !isDestroy &&
	        MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
	      BuildCFI(MBB, InsertPos, DL,
	               MCCFIInstruction::createGnuArgsSize(nullptr, Amount));

	    if (Amount == 0)
	      return I;

	    // Factor out the amount that gets handled inside the sequence
	    // (Pushes of argument for frame setup, callee pops for frame destroy)
	    Amount -= InternalAmt;

	    // TODO: This is needed only if we require precise CFA.
	    // If this is a callee-pop calling convention, emit a CFA adjust for
	    // the amount the callee popped.
	    if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
	      BuildCFI(MBB, InsertPos, DL,
	               MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));

	    // Add Amount to SP to destroy a frame, or subtract to setup.
	    int64_t StackAdjustment = isDestroy ? Amount : -Amount;
	    int64_t CfaAdjustment = -StackAdjustment;

	    if (StackAdjustment) {
	      // Merge with any previous or following adjustment instruction. Note: the
	      // instructions merged with here do not have CFI, so their stack
	      // adjustments do not feed into CfaAdjustment.
	      StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
	      StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);

	      if (StackAdjustment) {
	        // Prefer POPs when optimizing for minimum size; otherwise emit a
	        // regular stack adjustment.
	        if (!(F.optForMinSize() &&
	              adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
	          BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
	                               /*InEpilogue=*/false);
	      }
	    }

	    if (DwarfCFI && !hasFP(MF)) {
	      // If we don't have FP, but need to generate unwind information,
	      // we need to set the correct CFA offset after the stack adjustment.
	      // How much we adjust the CFA offset depends on whether we're emitting
	      // CFI only for EH purposes or for debugging. EH only requires the CFA
	      // offset to be correct at each call site, while for debugging we want
	      // it to be more precise.

	      // TODO: When not using precise CFA, we also need to adjust for the
	      // InternalAmt here.
	      if (CfaAdjustment) {
	        BuildCFI(MBB, InsertPos, DL,
	                 MCCFIInstruction::createAdjustCfaOffset(nullptr,
	                                                         CfaAdjustment));
	      }
	    }

	    return I;
	  }

	  if (isDestroy && InternalAmt) {
	    // If we are performing frame pointer elimination and if the callee pops
	    // something off the stack pointer, add it back. We do this until we have
	    // more advanced stack pointer tracking ability.
	    // We are not tracking the stack pointer adjustment by the callee, so make
	    // sure we restore the stack pointer immediately after the call, there may
	    // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
	    MachineBasicBlock::iterator CI = I;
	    MachineBasicBlock::iterator B = MBB.begin();
	    // Walk backwards until CI sits right after the nearest preceding call.
	    while (CI != B && !std::prev(CI)->isCall())
	      --CI;
	    BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
	  }

	  return I;
	}

	/// Returns true if \p MBB can host the prologue: either the function needs no
	/// stack realignment, or EFLAGS is not live into the block.
	bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
	  assert(MBB.getParent() && "Block is not attached to a function!");
	  const MachineFunction &MF = *MBB.getParent();
	  return !TRI->needsStackRealignment(MF) || !MBB.isLiveIn(X86::EFLAGS);
	}

	/// Returns true if the epilogue may be inserted into \p MBB.
	bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
	  assert(MBB.getParent() && "Block is not attached to a function!");

	  // Win64 has strict requirements on epilogues, so take no chances there:
	  // only a block that is already an exit block (no successors, or a return
	  // block) is accepted.
	  if (STI.isTargetWin64() && !(MBB.succ_empty() || MBB.isReturnBlock()))
	    return false;

	  // With LEA available for the SP update the block is accepted. Otherwise
	  // ADD may be needed, which clobbers EFLAGS, so the block is only accepted
	  // when the flags need not be preserved before its terminators.
	  return canUseLEAForSPInEpilogue(*MBB.getParent()) ||
	         !flagsNeedToBePreservedBeforeTheTerminators(MBB);
	}

	bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
	// If we may need to emit frameless compact unwind information, give
	// up as this is currently broken: PR25614.
	return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) \|\| hasFP(MF)) &&
	// The lowering of segmented stack and HiPE only support entry blocks
	// as prologue blocks: PR26107.
	// This limitation may be lifted if we fix:
	// - adjustForSegmentedStacks
	// - adjustForHiPEPrologue
	MF.getFunction().getCallingConv() != CallingConv::HiPE &&
	!MF.shouldSplitStack();
	}

	/// Emit, before \p MBBI, the instructions that re-establish the frame pointer
	/// (and the base pointer, when the EH registration node is addressed through
	/// it) for 32-bit Windows EH, optionally reloading ESP first.
	///
	/// \param MBB       block to insert into.
	/// \param MBBI      insertion point.
	/// \param DL        debug location attached to the emitted instructions.
	/// \param RestoreSP when true, ESP is first reloaded from -EHRegSize(%ebp).
	/// \returns \p MBBI unchanged.
	///
	/// Also records the computed end offset in FuncInfo.EHRegNodeEndOffset. All
	/// emitted instructions are flagged FrameSetup.
	MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
	MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	const DebugLoc &DL, bool RestoreSP) const {
	assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
	assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
	assert(STI.is32Bit() && !Uses64BitFramePtr &&
	"restoring EBP/ESI on non-32-bit target");

	MachineFunction &MF = *MBB.getParent();
	unsigned FramePtr = TRI->getFrameRegister(MF);
	unsigned BasePtr = TRI->getBaseRegister();
	WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
	X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
	MachineFrameInfo &MFI = MF.getFrameInfo();

	// FIXME: Don't set FrameSetup flag in catchret case.

	// Frame index and size of the EH registration node object.
	int FI = FuncInfo.EHRegNodeFrameIndex;
	int EHRegSize = MFI.getObjectSize(FI);

	if (RestoreSP) {
	// MOV32rm -EHRegSize(%ebp), %esp
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
	X86::EBP, true, -EHRegSize)
	.setMIFlag(MachineInstr::FrameSetup);
	}

	// UsedReg receives the register the registration node is addressed from.
	unsigned UsedReg;
	int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
	int EndOffset = -EHRegOffset - EHRegSize;
	FuncInfo.EHRegNodeEndOffset = EndOffset;

	if (UsedReg == FramePtr) {
	// ADD $offset, %ebp
	unsigned ADDri = getADDriOpcode(false, EndOffset);
	// Operand 3 of the ADD is marked dead after the instruction is built.
	BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
	.addReg(FramePtr)
	.addImm(EndOffset)
	.setMIFlag(MachineInstr::FrameSetup)
	->getOperand(3)
	.setIsDead();
	assert(EndOffset >= 0 &&
	"end of registration object above normal EBP position!");
	} else if (UsedReg == BasePtr) {
	// LEA offset(%ebp), %esi
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
	FramePtr, false, EndOffset)
	.setMIFlag(MachineInstr::FrameSetup);
	// MOV32rm SavedEBPOffset(%esi), %ebp
	assert(X86FI->getHasSEHFramePtrSave());
	int Offset =
	getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
	assert(UsedReg == BasePtr);
	addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
	UsedReg, true, Offset)
	.setMIFlag(MachineInstr::FrameSetup);
	} else {
	llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
	}
	return MBBI;
	}

	namespace {
	/// Per-object record used by orderFrameObjects to help sort the stack objects.
	struct X86FrameSortingObject {
	  bool IsValid = false;         // true if we care about this Object.
	  unsigned ObjectIndex = 0;     // Index of Object into MFI list.
	  unsigned ObjectSize = 0;      // Size of Object in bytes.
	  unsigned ObjectAlignment = 1; // Alignment of Object in bytes.
	  unsigned ObjectNumUses = 0;   // Object static number of uses.
	};

	/// Comparison functor used to order the local stack objects.
	///
	/// Objects are ranked by an estimated "density" (number of uses divided by
	/// size) in order to roughly minimize code size. So, for example, an object
	/// of size 16B that is referenced 5 times gets higher priority than 4 4B
	/// objects referenced 1 time each. It is not perfect (for example, 0(esp)
	/// requires fewer bytes, and objects at the fringe could be treated
	/// specially), but a more precise model is not worth the added complexity.
	///
	/// Ordering produced by a sort with this comparator:
	///  - valid objects come first, in increasing density, so the objects with
	///    the highest priority end up last among the valid ones;
	///  - for equal densities, lower alignment sorts first;
	///  - all invalid objects are placed at the end, which lets a caller stop
	///    walking at the first invalid entry.
	///
	/// The call operator is const-qualified so the comparator can be invoked
	/// through a const object or const reference.
	struct X86FrameSortingComparator {
	  inline bool operator()(const X86FrameSortingObject &A,
	                         const X86FrameSortingObject &B) const {
	    // Invalid objects never sort before anything; valid ones always sort
	    // before invalid ones.
	    if (!A.IsValid)
	      return false;
	    if (!B.IsValid)
	      return true;

	    // Conceptually the densities are
	    //   DensityA = A.ObjectNumUses / A.ObjectSize
	    //   DensityB = B.ObjectNumUses / B.ObjectSize
	    // Floating point <, >, == may be inconsistent depending on the floating
	    // point model the compiler was built with, so both sides are scaled by
	    // A.ObjectSize * B.ObjectSize instead. That factors the division away and
	    // keeps the comparison in 64-bit integer arithmetic.
	    const uint64_t DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
	                                    static_cast<uint64_t>(B.ObjectSize);
	    const uint64_t DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
	                                    static_cast<uint64_t>(A.ObjectSize);

	    // Different densities decide the order directly.
	    if (DensityAScaled != DensityBScaled)
	      return DensityAScaled < DensityBScaled;

	    // Equal densities: order by alignment so that similarly aligned objects
	    // are packed together. Packing similar alignments across different
	    // densities could save more padding, but the gain is not worth the extra
	    // complexity in general.
	    return A.ObjectAlignment < B.ObjectAlignment;
	  }
	};
	} // namespace

	// Order the symbols in the local stack.
	// We want to place the local stack objects in some sort of sensible order.
	// The heuristic we use is to try and pack them according to static number
	// of uses and size of object in order to minimize code size.
	void X86FrameLowering::orderFrameObjects(
	const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
	const MachineFrameInfo &MFI = MF.getFrameInfo();

	// Don't waste time if there's nothing to do.
	if (ObjectsToAllocate.empty())
	return;

	// Create an array of all MFI objects. We won't need all of these
	// objects, but we're going to create a full array of them to make
	// it easier to index into when we're counting "uses" down below.
	// We want to be able to easily/cheaply access an object by simply
	// indexing into it, instead of having to search for it every time.
	std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());

	// Walk the objects we care about and mark them as such in our working
	// struct.
	for (auto &Obj : ObjectsToAllocate) {
	SortingObjects[Obj].IsValid = true;
	SortingObjects[Obj].ObjectIndex = Obj;
	SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj);
	// Set the size.
	int ObjectSize = MFI.getObjectSize(Obj);
	if (ObjectSize == 0)
	// Variable size. Just use 4.
	SortingObjects[Obj].ObjectSize = 4;
	else
	SortingObjects[Obj].ObjectSize = ObjectSize;
	}

	// Count the number of uses for each object.
	for (auto &MBB : MF) {
	for (auto &MI : MBB) {
	if (MI.isDebugValue())
	continue;
	for (const MachineOperand &MO : MI.operands()) {
	// Check to see if it's a local stack symbol.
	if (!MO.isFI())
	continue;
	int Index = MO.getIndex();
	// Check to see if it falls within our range, and is tagged
	// to require ordering.
	if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
	SortingObjects[Index].IsValid)
	SortingObjects[Index].ObjectNumUses++;
	}
	}
	}

	// Sort the objects using X86FrameSortingAlgorithm (see its comment for
	// info).
	std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
	X86FrameSortingComparator());

	// Now modify the original list to represent the final order that
	// we want. The order will depend on whether we're going to access them
	// from the stack pointer or the frame pointer. For SP, the list should
	// end up with the END containing objects that we want with smaller offsets.
	// For FP, it should be flipped.
	int i = 0;
	for (auto &Obj : SortingObjects) {
	// All invalid items are sorted at the end, so it's safe to stop.
	if (!Obj.IsValid)
	break;
	ObjectsToAllocate[i++] = Obj.ObjectIndex;
	}

	// Flip it if we're accessing off of the FP.
	if (!TRI->needsStackRealignment(MF) && hasFP(MF))
	std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
	}


	// Compute the offset used to locate the parent frame pointer for WinEH by
	// summing the pieces of the frame that are laid out in front of it.
	unsigned
	X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
	  const auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();

	  // The prologue homes RDX, the parent frame pointer, into 16(%rsp).
	  unsigned ParentFrameOffset = 16;
	  // RBP is pushed immediately afterwards.
	  ParentFrameOffset += SlotSize;
	  // Then come all of the callee-saved registers.
	  ParentFrameOffset += X86FI->getCalleeSavedFrameSize();
	  // Finally, each funclet reserves enough stack space for the largest
	  // outgoing call.
	  ParentFrameOffset += getWinEHFuncletFrameSize(MF);
	  return ParentFrameOffset;
	}

	// Prepare the frame of MF before its layout is finalized. Clears the WinCFI
	// flag and, for 64-bit functions using the MSVC C++ EH personality with
	// funclets, assigns offsets to the catch objects, creates the UnwindHelp
	// fixed object and emits the store that initializes it at function entry.
	// RS is not used here.
	void X86FrameLowering::processFunctionBeforeFrameFinalized(
	    MachineFunction &MF, RegScavenger *RS) const {
	  // Mark the function as not having WinCFI. We will set it back to true in
	  // emitPrologue if it gets called and emits CFI.
	  MF.setHasWinCFI(false);

	  // If this function isn't doing Win64-style C++ EH, we don't need to do
	  // anything.
	  const Function &F = MF.getFunction();
	  if (!STI.is64Bit() || !MF.hasEHFunclets() ||
	      classifyEHPersonality(F.getPersonalityFn()) != EHPersonality::MSVC_CXX)
	    return;

	  // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
	  // relative to RSP after the prologue. Find the offset of the last fixed
	  // object, so that we can allocate a slot immediately following it. If there
	  // were no fixed objects, use offset -SlotSize, which is immediately after the
	  // return address. Fixed objects have negative frame indices.
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
	  int64_t MinFixedObjOffset = -SlotSize;
	  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
	    MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));

	  // Place every catch object below the objects laid out so far. A frame
	  // index of INT_MAX marks a handler without a catch object slot.
	  for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
	    for (WinEHHandlerType &H : TBME.HandlerArray) {
	      int FrameIndex = H.CatchObj.FrameIndex;
	      if (FrameIndex != INT_MAX) {
	        // Ensure alignment: the offset is negative, so growing its magnitude
	        // by the remainder rounds it down to a multiple of Align.
	        unsigned Align = MFI.getObjectAlignment(FrameIndex);
	        MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
	        MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
	        MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
	      }
	    }
	  }

	  // Ensure alignment (8 bytes) before carving out the UnwindHelp slot.
	  MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
	  int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
	  int UnwindHelpFI =
	      MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
	  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;

	  // Store -2 into UnwindHelp on function entry. We have to scan forwards past
	  // other frame setup instructions.
	  MachineBasicBlock &MBB = MF.front();
	  auto MBBI = MBB.begin();
	  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
	    ++MBBI;

	  DebugLoc DL = MBB.findDebugLoc(MBBI);
	  addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
	                    UnwindHelpFI)
	      .addImm(-2);
	}
	Index: head/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 328817)
	@@ -1,3108 +1,3108 @@
	//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines a DAG pattern matching instruction selector for X86,
	// converting from a legalized DAG to an X86 DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "X86.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86RegisterInfo.h"
	#include "X86Subtarget.h"
	#include "X86TargetMachine.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/SelectionDAGISel.h"
	#include "llvm/IR/ConstantRange.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Type.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <stdint.h>
	using namespace llvm;

	#define DEBUG_TYPE "x86-isel"

	STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

	//===----------------------------------------------------------------------===//
	// Pattern Matcher Implementation
	//===----------------------------------------------------------------------===//

	namespace {
	/// This corresponds to X86AddressMode, but uses SDValue's instead of register
	/// numbers for the leaves of the matched tree.
	struct X86ISelAddressMode {
	enum {
	RegBase,
	FrameIndexBase
	} BaseType;

	// This is really a union, discriminated by BaseType!
	SDValue Base_Reg;
	int Base_FrameIndex;

	unsigned Scale;
	SDValue IndexReg;
	int32_t Disp;
	SDValue Segment;
	const GlobalValue *GV;
	const Constant *CP;
	const BlockAddress *BlockAddr;
	const char *ES;
	MCSymbol *MCSym;
	int JT;
	unsigned Align; // CP alignment.
	unsigned char SymbolFlags; // X86II::MO_*

	X86ISelAddressMode()
	: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
	Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
	MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}

	bool hasSymbolicDisplacement() const {
	return GV != nullptr \|\| CP != nullptr \|\| ES != nullptr \|\|
	MCSym != nullptr \|\| JT != -1 \|\| BlockAddr != nullptr;
	}

	bool hasBaseOrIndexReg() const {
	return BaseType == FrameIndexBase \|\|
	IndexReg.getNode() != nullptr \|\| Base_Reg.getNode() != nullptr;
	}

	/// Return true if this addressing mode is already RIP-relative.
	bool isRIPRelative() const {
	if (BaseType != RegBase) return false;
	if (RegisterSDNode *RegNode =
	dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
	return RegNode->getReg() == X86::RIP;
	return false;
	}

	void setBaseReg(SDValue Reg) {
	BaseType = RegBase;
	Base_Reg = Reg;
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	void dump() {
	dbgs() << "X86ISelAddressMode " << this << '\n';
	dbgs() << "Base_Reg ";
	if (Base_Reg.getNode())
	Base_Reg.getNode()->dump();
	else
	dbgs() << "nul\n";
	if (BaseType == FrameIndexBase)
	dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
	dbgs() << " Scale " << Scale << '\n'
	<< "IndexReg ";
	if (IndexReg.getNode())
	IndexReg.getNode()->dump();
	else
	dbgs() << "nul\n";
	dbgs() << " Disp " << Disp << '\n'
	<< "GV ";
	if (GV)
	GV->dump();
	else
	dbgs() << "nul";
	dbgs() << " CP ";
	if (CP)
	CP->dump();
	else
	dbgs() << "nul";
	dbgs() << '\n'
	<< "ES ";
	if (ES)
	dbgs() << ES;
	else
	dbgs() << "nul";
	dbgs() << " MCSym ";
	if (MCSym)
	dbgs() << MCSym;
	else
	dbgs() << "nul";
	dbgs() << " JT" << JT << " Align" << Align << '\n';
	}
	#endif
	};
	}

	namespace {
	//===--------------------------------------------------------------------===//
	/// ISel - X86-specific code to select X86 machine instructions for
	/// SelectionDAG operations.
	///
	class X86DAGToDAGISel final : public SelectionDAGISel {
	  /// Keep a pointer to the X86Subtarget around so that we can
	  /// make the right decision when generating code for different targets.
	  /// Set at the start of every runOnMachineFunction call.
	  const X86Subtarget *Subtarget;

	  /// If true, selector should try to optimize for code size instead of
	  /// performance.
	  bool OptForSize;

	  /// If true, selector should try to optimize for minimum code size.
	  bool OptForMinSize;

	public:
	  explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
	      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
	        OptForMinSize(false) {}

	  StringRef getPassName() const override {
	    return "X86 DAG->DAG Instruction Selection";
	  }

	  bool runOnMachineFunction(MachineFunction &MF) override {
	    // Reset the subtarget each time through.
	    Subtarget = &MF.getSubtarget<X86Subtarget>();
	    SelectionDAGISel::runOnMachineFunction(MF);
	    return true;
	  }

	  void EmitFunctionEntryCode() override;

	  bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

	  void PreprocessISelDAG() override;

	// Include the pieces autogenerated from the target description.
	#include "X86GenDAGISel.inc"

	private:
	  void Select(SDNode *N) override;

	  bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
	  bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
	  bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
	  bool matchAddress(SDValue N, X86ISelAddressMode &AM);
	  bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
	  bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
	  bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
	                               unsigned Depth);
	  bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
	  bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
	                  SDValue &Scale, SDValue &Index, SDValue &Disp,
	                  SDValue &Segment);
	  bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
	                        SDValue &Scale, SDValue &Index, SDValue &Disp,
	                        SDValue &Segment);
	  bool selectMOV64Imm32(SDValue N, SDValue &Imm);
	  bool selectLEAAddr(SDValue N, SDValue &Base,
	                     SDValue &Scale, SDValue &Index, SDValue &Disp,
	                     SDValue &Segment);
	  bool selectLEA64_32Addr(SDValue N, SDValue &Base,
	                          SDValue &Scale, SDValue &Index, SDValue &Disp,
	                          SDValue &Segment);
	  bool selectTLSADDRAddr(SDValue N, SDValue &Base,
	                         SDValue &Scale, SDValue &Index, SDValue &Disp,
	                         SDValue &Segment);
	  bool selectScalarSSELoad(SDNode *Root, SDValue N,
	                           SDValue &Base, SDValue &Scale,
	                           SDValue &Index, SDValue &Disp,
	                           SDValue &Segment,
	                           SDValue &NodeWithChain);
	  bool selectRelocImm(SDValue N, SDValue &Op);

	  bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
	                   SDValue &Base, SDValue &Scale,
	                   SDValue &Index, SDValue &Disp,
	                   SDValue &Segment);

	  // Convenience method where P is also root.
	  bool tryFoldLoad(SDNode *P, SDValue N,
	                   SDValue &Base, SDValue &Scale,
	                   SDValue &Index, SDValue &Disp,
	                   SDValue &Segment) {
	    return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
	  }

	  /// Implement addressing mode selection for inline asm expressions.
	  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
	                                    unsigned ConstraintID,
	                                    std::vector<SDValue> &OutOps) override;

	  void emitSpecialCodeForMain();

	  /// Expand the matched addressing mode AM into the five operands (Base,
	  /// Scale, Index, Disp, Segment) of an X86 memory reference.
	  inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
	                                 SDValue &Base, SDValue &Scale,
	                                 SDValue &Index, SDValue &Disp,
	                                 SDValue &Segment) {
	    Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
	               ? CurDAG->getTargetFrameIndex(
	                     AM.Base_FrameIndex,
	                     TLI->getPointerTy(CurDAG->getDataLayout()))
	               : AM.Base_Reg;
	    Scale = getI8Imm(AM.Scale, DL);
	    Index = AM.IndexReg;
	    // These are 32-bit even in 64-bit mode since RIP-relative offset
	    // is 32-bit.
	    if (AM.GV)
	      Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
	                                            MVT::i32, AM.Disp,
	                                            AM.SymbolFlags);
	    else if (AM.CP)
	      Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
	                                           AM.Align, AM.Disp, AM.SymbolFlags);
	    else if (AM.ES) {
	      assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
	      Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
	    } else if (AM.MCSym) {
	      assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
	      assert(AM.SymbolFlags == 0 && "oo");
	      Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
	    } else if (AM.JT != -1) {
	      assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
	      Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
	    } else if (AM.BlockAddr)
	      Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
	                                           AM.SymbolFlags);
	    else
	      Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

	    // Register 0 is used when no segment was matched.
	    if (AM.Segment.getNode())
	      Segment = AM.Segment;
	    else
	      Segment = CurDAG->getRegister(0, MVT::i32);
	  }

	  // Utility function to determine whether we should avoid selecting
	  // immediate forms of instructions for better code size or not.
	  // At a high level, we'd like to avoid such instructions when
	  // we have similar constants used within the same basic block
	  // that can be kept in a register.
	  //
	  // Returns true once at least two qualifying uses of N have been found.
	  bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
	    uint32_t UseCount = 0;

	    // Do not want to hoist if we're not optimizing for size.
	    // TODO: We'd like to remove this restriction.
	    // See the comment in X86InstrInfo.td for more info.
	    if (!OptForSize)
	      return false;

	    // Walk all the users of the immediate. Two counted uses are enough to
	    // decide, so stop as soon as they are found.
	    for (SDNode::use_iterator UI = N->use_begin(),
	         UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {

	      SDNode *User = *UI;

	      // This user is already selected. Count it as a legitimate use and
	      // move on.
	      if (User->isMachineOpcode()) {
	        UseCount++;
	        continue;
	      }

	      // We want to count stores of immediates as real uses.
	      if (User->getOpcode() == ISD::STORE &&
	          User->getOperand(1).getNode() == N) {
	        UseCount++;
	        continue;
	      }

	      // We don't currently match users that have > 2 operands (except
	      // for stores, which are handled above)
	      // Those instruction won't match in ISEL, for now, and would
	      // be counted incorrectly.
	      // This may change in the future as we add additional instruction
	      // types.
	      if (User->getNumOperands() != 2)
	        continue;

	      // Immediates that are used for offsets as part of stack
	      // manipulation should be left alone. These are typically
	      // used to indicate SP offsets for argument passing and
	      // will get pulled into stores/pushes (implicitly).
	      if (User->getOpcode() == X86ISD::ADD ||
	          User->getOpcode() == ISD::ADD    ||
	          User->getOpcode() == X86ISD::SUB ||
	          User->getOpcode() == ISD::SUB) {

	        // Find the other operand of the add/sub.
	        SDValue OtherOp = User->getOperand(0);
	        if (OtherOp.getNode() == N)
	          OtherOp = User->getOperand(1);

	        // Don't count if the other operand is SP.
	        RegisterSDNode *RegNode;
	        if (OtherOp->getOpcode() == ISD::CopyFromReg &&
	            (RegNode = dyn_cast_or_null<RegisterSDNode>(
	                 OtherOp->getOperand(1).getNode())))
	          if ((RegNode->getReg() == X86::ESP) ||
	              (RegNode->getReg() == X86::RSP))
	            continue;
	      }

	      // ... otherwise, count this and move on.
	      UseCount++;
	    }

	    // If we have more than 1 use, then recommend for hoisting.
	    return (UseCount > 1);
	  }

	  /// Return a target constant with the specified value of type i8.
	  inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
	    return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
	  }

	  /// Return a target constant with the specified value, of type i32.
	  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
	    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
	  }

	  /// Convert the element index in operand 1 of N into an i8 immediate that
	  /// counts VecWidth-bit chunks of N's operand 0.
	  SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
	                                      const SDLoc &DL) {
	    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
	    uint64_t Index = N->getConstantOperandVal(1);
	    MVT VecVT = N->getOperand(0).getSimpleValueType();
	    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
	  }

	  /// Convert the element index in operand 2 of N into an i8 immediate that
	  /// counts VecWidth-bit chunks of N's result type.
	  SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
	                                    const SDLoc &DL) {
	    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
	    uint64_t Index = N->getConstantOperandVal(2);
	    MVT VecVT = N->getSimpleValueType(0);
	    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
	  }

	  /// Return an SDNode that returns the value of the global base register.
	  /// Output instructions required to initialize the global base register,
	  /// if necessary.
	  SDNode *getGlobalBaseReg();

	  /// Return a reference to the TargetMachine, cast to the target-specific
	  /// type.
	  const X86TargetMachine &getTargetMachine() const {
	    return static_cast<const X86TargetMachine &>(TM);
	  }

	  /// Return a pointer to the target-specific instruction info of the
	  /// current subtarget.
	  const X86InstrInfo *getInstrInfo() const {
	    return Subtarget->getInstrInfo();
	  }

	  /// \brief Address-mode matching performs shift-of-and to and-of-shift
	  /// reassociation in order to expose more scaled addressing
	  /// opportunities.
	  bool ComplexPatternFuncMutatesDAG() const override {
	    return true;
	  }

	  bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

	  /// Returns whether this is a relocatable immediate that fits in a signed
	  /// Width-bit field, i.e. in the range [-2^(Width-1) .. 2^(Width-1)-1].
	  /// Constants are checked with isInt<Width>; any other node is deferred to
	  /// isSExtAbsoluteSymbolRef.
	  template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
	    if (auto *CN = dyn_cast<ConstantSDNode>(N))
	      return isInt<Width>(CN->getSExtValue());
	    return isSExtAbsoluteSymbolRef(Width, N);
	  }

	  // Indicates we should prefer to use a non-temporal load for this load.
	  bool useNonTemporalLoad(LoadSDNode *N) const {
	    if (!N->isNonTemporal())
	      return false;

	    unsigned StoreSize = N->getMemoryVT().getStoreSize();

	    // The load must be aligned to at least its own size.
	    if (N->getAlignment() < StoreSize)
	      return false;

	    // Each supported size requires a matching subtarget feature.
	    switch (StoreSize) {
	    default: llvm_unreachable("Unsupported store size");
	    case 16:
	      return Subtarget->hasSSE41();
	    case 32:
	      return Subtarget->hasAVX2();
	    case 64:
	      return Subtarget->hasAVX512();
	    }
	  }

	  bool foldLoadStoreIntoMemOperand(SDNode *Node);

	  bool matchBEXTRFromAnd(SDNode *Node);

	  bool isMaskZeroExtended(SDNode *N) const;
	};
	}


	// Returns true if this masked compare can be implemented legally with this
	// type.
	static bool isLegalMaskCompare(SDNode N, const X86Subtarget Subtarget) {
	unsigned Opcode = N->getOpcode();
	if (Opcode == X86ISD::PCMPEQM \|\| Opcode == X86ISD::PCMPGTM \|\|
	Opcode == X86ISD::CMPM \|\| Opcode == X86ISD::TESTM \|\|
	Opcode == X86ISD::TESTNM \|\| Opcode == X86ISD::CMPMU \|\|
	Opcode == X86ISD::CMPM_RND) {
	// We can get 256-bit 8 element types here without VLX being enabled. When
	// this happens we will use 512-bit operations and the mask will not be
	// zero extended.
	EVT OpVT = N->getOperand(0).getValueType();
	if (OpVT == MVT::v8i32 \|\| OpVT == MVT::v8f32)
	return Subtarget->hasVLX();

	return true;
	}

	return false;
	}

	// Returns true if we can assume the writer of the mask has zero extended it
	// for us.
	bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
	  // If this is an AND, check if we have a compare on either side. As long as
	  // one side guarantees the mask is zero extended, the AND will preserve those
	  // zeros.
	  if (N->getOpcode() == ISD::AND)
	    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
	           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

	  // Otherwise N itself has to be a legal mask compare.
	  return isLegalMaskCompare(N, Subtarget);
	}

	bool
	X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode U, SDNode Root) const {
	if (OptLevel == CodeGenOpt::None) return false;

	if (!N.hasOneUse())
	return false;

	if (N.getOpcode() != ISD::LOAD)
	return true;

	// If N is a load, do additional profitability checks.
	if (U == Root) {
	switch (U->getOpcode()) {
	default: break;
	case X86ISD::ADD:
	case X86ISD::SUB:
	case X86ISD::AND:
	case X86ISD::XOR:
	case X86ISD::OR:
	case ISD::ADD:
	case ISD::ADDCARRY:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: {
	SDValue Op1 = U->getOperand(1);

	// If the other operand is a 8-bit immediate we should fold the immediate
	// instead. This reduces code size.
	// e.g.
	// movl 4(%esp), %eax
	// addl $4, %eax
	// vs.
	// movl $4, %eax
	// addl 4(%esp), %eax
	// The former is 2 bytes shorter. In case where the increment is 1, then
	// the saving can be 4 bytes (by using incl %eax).
	if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
	if (Imm->getAPIntValue().isSignedIntN(8))
	return false;

	// If the other operand is a TLS address, we should fold it instead.
	// This produces
	// movl %gs:0, %eax
	// leal i@NTPOFF(%eax), %eax
	// instead of
	// movl $i@NTPOFF, %eax
	// addl %gs:0, %eax
	// if the block also has an access to a second TLS address this will save
	// a load.
	// FIXME: This is probably also true for non-TLS addresses.
	if (Op1.getOpcode() == X86ISD::Wrapper) {
	SDValue Val = Op1.getOperand(0);
	if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
	return false;
	}
	}
	}
	}

	return true;
	}

	/// Replace the original chain operand of the call with
	/// load's chain operand and move load below the call's chain operand.
	///
	/// Three nodes are rewritten in place, in this order: OrigChain (so that it
	/// no longer depends on Load), Load (so that its chain becomes Call's
	/// current chain operand) and Call (so that its chain becomes the chain
	/// result of Load).
	static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
	SDValue Call, SDValue OrigChain) {
	SmallVector<SDValue, 8> Ops;
	SDValue Chain = OrigChain.getOperand(0);
	// Build the new chain operand of OrigChain with Load bypassed.
	if (Chain.getNode() == Load.getNode())
	// OrigChain hangs directly off Load: use Load's own chain instead.
	Ops.push_back(Load.getOperand(0));
	else {
	assert(Chain.getOpcode() == ISD::TokenFactor &&
	"Unexpected chain operand");
	// Rebuild the TokenFactor, substituting Load's chain for Load itself.
	for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
	if (Chain.getOperand(i).getNode() == Load.getNode())
	Ops.push_back(Load.getOperand(0));
	else
	Ops.push_back(Chain.getOperand(i));
	SDValue NewChain =
	CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
	Ops.clear();
	Ops.push_back(NewChain);
	}
	// Keep the remaining operands of OrigChain unchanged.
	Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
	CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
	// Chain Load on whatever Call was chained on; operands 1 and 2 of Load are
	// carried over as they are.
	CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
	Load.getOperand(1), Load.getOperand(2));

	// Finally make Call use result 1 of Load as its chain and keep its other
	// operands.
	Ops.clear();
	Ops.push_back(SDValue(Load.getNode(), 1));
	Ops.append(Call->op_begin() + 1, Call->op_end());
	CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
	}

	/// Return true if call address is a load and it can be
	/// moved below CALLSEQ_START and the chains leading up to the call.
	/// Return the CALLSEQ_START by reference as a second output.
	/// In the case of a tail call, there isn't a callseq node between the call
	/// chain and the load.
	///
	/// Note that Chain is advanced while searching, so it may have been
	/// modified even when false is returned.
	static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
	  // The transformation is somewhat dangerous if the call's chain was glued to
	  // the call. After MoveBelowOrigChain the load is moved between the call and
	  // the chain, this can create a cycle if the load is not folded. So it is
	  // really important that we are sure the load will be folded.
	  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
	    return false;
	  // Only plain loads qualify: non-volatile, unindexed and non-extending.
	  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
	  if (!LD ||
	      LD->isVolatile() ||
	      LD->getAddressingMode() != ISD::UNINDEXED ||
	      LD->getExtensionType() != ISD::NON_EXTLOAD)
	    return false;

	  // Now let's find the callseq_start. Every node on the way must have a
	  // single use.
	  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
	    if (!Chain.hasOneUse())
	      return false;
	    Chain = Chain.getOperand(0);
	  }

	  if (!Chain.getNumOperands())
	    return false;
	  // Since we are not checking for AA here, conservatively abort if the chain
	  // writes to memory. It's not safe to move the callee (a load) across a store.
	  if (isa<MemSDNode>(Chain.getNode()) &&
	      cast<MemSDNode>(Chain.getNode())->writeMem())
	    return false;
	  // The load is either the direct chain operand ...
	  if (Chain.getOperand(0).getNode() == Callee.getNode())
	    return true;
	  // ... or an operand of a TokenFactor, with no other user of its chain
	  // result.
	  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
	      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
	      Callee.getValue(1).hasOneUse())
	    return true;
	  return false;
	}

	void X86DAGToDAGISel::PreprocessISelDAG() {
	// OptFor[Min]Size are used in pattern predicates that isel is matching.
	OptForSize = MF->getFunction().optForSize();
	OptForMinSize = MF->getFunction().optForMinSize();
	assert((!OptForMinSize \|\| OptForSize) && "OptForMinSize implies OptForSize");

	for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
	E = CurDAG->allnodes_end(); I != E; ) {
	SDNode N = &I++; // Preincrement iterator to avoid invalidation issues.

	if (OptLevel != CodeGenOpt::None &&
	- // Only does this when target favors doesn't favor register indirect
	- // call.
	+ // Only do this when the target can fold the load into the call or
	+ // jmp.
	+ !Subtarget->useRetpoline() &&
	((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) \|\|
	(N->getOpcode() == X86ISD::TC_RETURN &&
	- // Only does this if load can be folded into TC_RETURN.
	(Subtarget->is64Bit() \|\|
	!getTargetMachine().isPositionIndependent())))) {
	/// Also try moving call address load from outside callseq_start to just
	/// before the call to allow it to be folded.
	///
	/// [Load chain]
	/// ^
	/// \|
	/// [Load]
	/// ^ ^
	/// \| \|
	/// / \--
	/// / \|
	///[CALLSEQ_START] \|
	/// ^ \|
	/// \| \|
	/// [LOAD/C2Reg] \|
	/// \| \|
	/// \ /
	/// \ /
	/// [CALL]
	bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
	SDValue Chain = N->getOperand(0);
	SDValue Load = N->getOperand(1);
	if (!isCalleeLoad(Load, Chain, HasCallSeq))
	continue;
	moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
	++NumLoadMoved;
	continue;
	}

	// Lower fpround and fpextend nodes that target the FP stack to be store and
	// load to the stack. This is a gross hack. We would like to simply mark
	// these as being illegal, but when we do that, legalize produces these when
	// it expands calls, then expands these in the same legalize pass. We would
	// like dag combine to be able to hack on these between the call expansion
	// and the node legalization. As such this pass basically does "really
	// late" legalization of these inline with the X86 isel pass.
	// FIXME: This should only happen when not compiled with -O0.
	if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
	continue;

	MVT SrcVT = N->getOperand(0).getSimpleValueType();
	MVT DstVT = N->getSimpleValueType(0);

	// If any of the sources are vectors, no fp stack involved.
	if (SrcVT.isVector() \|\| DstVT.isVector())
	continue;

	// If the source and destination are SSE registers, then this is a legal
	// conversion that should not be lowered.
	const X86TargetLowering *X86Lowering =
	static_cast<const X86TargetLowering *>(TLI);
	bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
	bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
	if (SrcIsSSE && DstIsSSE)
	continue;

	if (!SrcIsSSE && !DstIsSSE) {
	// If this is an FPStack extension, it is a noop.
	if (N->getOpcode() == ISD::FP_EXTEND)
	continue;
	// If this is a value-preserving FPStack truncation, it is a noop.
	if (N->getConstantOperandVal(1))
	continue;
	}

	// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
	// FPStack has extload and truncstore. SSE can fold direct loads into other
	// operations. Based on this, decide what we want to do.
	MVT MemVT;
	if (N->getOpcode() == ISD::FP_ROUND)
	MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
	else
	MemVT = SrcIsSSE ? SrcVT : DstVT;

	SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
	SDLoc dl(N);

	// FIXME: optimize the case where the src/dest is a load or store?
	SDValue Store =
	CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
	MemTmp, MachinePointerInfo(), MemVT);
	SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
	MachinePointerInfo(), MemVT);

	// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
	// extload we created. This will cause general havok on the dag because
	// anything below the conversion could be folded into other existing nodes.
	// To avoid invalidating 'I', back it up to the convert node.
	--I;
	CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);

	// Now that we did that, the node is dead. Increment the iterator to the
	// next node to process, then delete N.
	++I;
	CurDAG->DeleteNode(N);
	}
	}


	/// Emit any code that needs to be executed only in the main function.
	void X86DAGToDAGISel::emitSpecialCodeForMain() {
	if (Subtarget->isTargetCygMing()) {
	TargetLowering::ArgListTy Args;
	auto &DL = CurDAG->getDataLayout();

	TargetLowering::CallLoweringInfo CLI(*CurDAG);
	CLI.setChain(CurDAG->getRoot())
	.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
	CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
	std::move(Args));
	const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
	std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
	CurDAG->setRoot(Result.second);
	}
	}

	void X86DAGToDAGISel::EmitFunctionEntryCode() {
	// If this is main, emit special code for main.
	const Function &F = MF->getFunction();
	if (F.hasExternalLinkage() && F.getName() == "main")
	emitSpecialCodeForMain();
	}

	/// Return true if Val may be used as the explicit displacement of an
	/// address whose base is a frame index.
	static bool isDispSafeForFrameIndex(int64_t Val) {
	  // A frame index on a 64-bit platform carries a displacement of its own,
	  // and adding the explicit displacement to it can overflow the displacement
	  // field. Under the assumption that the frame index displacement fits in 31
	  // bits (only slightly stronger than the existing basic assumption of 32
	  // bits), an explicit displacement of 31 bits is always safe.
	  const bool FitsIn31Bits = isInt<31>(Val);
	  return FitsIn31Bits;
	}

	/// Try to add Offset to the displacement of AM. Returns true if the offset
	/// cannot be folded (AM is left untouched) and false on success, in which
	/// case AM.Disp holds the combined displacement.
	bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
	                                            X86ISelAddressMode &AM) {
	  // Cannot combine ExternalSymbol displacements with integer offsets.
	  if (Offset != 0 && (AM.ES || AM.MCSym))
	    return true;
	  int64_t Val = AM.Disp + Offset;
	  CodeModel::Model M = TM.getCodeModel();
	  if (Subtarget->is64Bit()) {
	    // The combined displacement must be acceptable for the code model.
	    if (!X86::isOffsetSuitableForCodeModel(Val, M,
	                                           AM.hasSymbolicDisplacement()))
	      return true;
	    // In addition to the checks required for a register base, check that
	    // we do not try to use an unsafe Disp with a frame index.
	    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
	        !isDispSafeForFrameIndex(Val))
	      return true;
	  }
	  AM.Disp = Val;
	  return false;
	}

	/// Try to match a load from address 0 in address space 256 or 257 as a use
	/// of the GS or FS segment register. Returns false if AM.Segment was set
	/// and true if nothing was matched.
	bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
	  SDValue Address = N->getOperand(1);

	  // load gs:0 -> GS segment register.
	  // load fs:0 -> FS segment register.
	  //
	  // This optimization is valid because the GNU TLS model defines that
	  // gs:0 (or fs:0 on X86-64) contains its own address.
	  // For more information see http://people.redhat.com/drepper/tls.pdf
	  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
	    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
	        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
	         Subtarget->isTargetFuchsia()))
	      switch (N->getPointerInfo().getAddrSpace()) {
	      case 256:
	        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
	        return false;
	      case 257:
	        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
	        return false;
	      // Address space 258 is not handled here, because it is not used to
	      // address TLS areas.
	      }

	  return true;
	}

	/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
	/// mode. These wrap things that will resolve down into a symbol reference.
	///
	/// On success the wrapped symbol (global, constant pool entry, external
	/// symbol, MC symbol, jump table or block address) is recorded in \p AM. In
	/// the RIP-relative path %rip is additionally installed as the base register,
	/// and \p AM is restored to its value on entry if the symbol's offset cannot
	/// be folded.
	///
	/// \param N  the Wrapper / WrapperRIP node; operand 0 is the symbol node.
	/// \param AM addressing mode being built; updated in place.
	/// \returns true if no match is possible, false if the node was folded.
	bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
	  // If the addressing mode already has a symbol as the displacement, we can
	  // never match another symbol.
	  if (AM.hasSymbolicDisplacement())
	    return true;

	  SDValue N0 = N.getOperand(0);
	  CodeModel::Model M = TM.getCodeModel();

	  // Handle X86-64 rip-relative addresses. We check this before checking direct
	  // folding because RIP is preferable to non-RIP accesses.
	  if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
	      // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
	      // they cannot be folded into immediate fields.
	      // FIXME: This can be improved for kernel and other models?
	      (M == CodeModel::Small || M == CodeModel::Kernel)) {
	    // Base and index reg must be 0 in order to use %rip as base.
	    if (AM.hasBaseOrIndexReg())
	      return true;
	    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
	      // Keep a copy so a failed offset fold leaves AM untouched.
	      X86ISelAddressMode Backup = AM;
	      AM.GV = G->getGlobal();
	      AM.SymbolFlags = G->getTargetFlags();
	      if (foldOffsetIntoAddress(G->getOffset(), AM)) {
	        AM = Backup;
	        return true;
	      }
	    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
	      X86ISelAddressMode Backup = AM;
	      AM.CP = CP->getConstVal();
	      AM.Align = CP->getAlignment();
	      AM.SymbolFlags = CP->getTargetFlags();
	      if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
	        AM = Backup;
	        return true;
	      }
	    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
	      AM.ES = S->getSymbol();
	      AM.SymbolFlags = S->getTargetFlags();
	    } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
	      AM.MCSym = S->getMCSymbol();
	    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
	      AM.JT = J->getIndex();
	      AM.SymbolFlags = J->getTargetFlags();
	    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
	      X86ISelAddressMode Backup = AM;
	      AM.BlockAddr = BA->getBlockAddress();
	      AM.SymbolFlags = BA->getTargetFlags();
	      if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
	        AM = Backup;
	        return true;
	      }
	    } else
	      llvm_unreachable("Unhandled symbol reference node.");

	    if (N.getOpcode() == X86ISD::WrapperRIP)
	      AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
	    return false;
	  }

	  // Handle the case when globals fit in our immediate field: This is true for
	  // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit
	  // mode, this only applies to a non-RIP-relative computation.
	  if (!Subtarget->is64Bit() ||
	      M == CodeModel::Small || M == CodeModel::Kernel) {
	    assert(N.getOpcode() != X86ISD::WrapperRIP &&
	           "RIP-relative addressing already handled");
	    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
	      AM.GV = G->getGlobal();
	      AM.Disp += G->getOffset();
	      AM.SymbolFlags = G->getTargetFlags();
	    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
	      AM.CP = CP->getConstVal();
	      AM.Align = CP->getAlignment();
	      AM.Disp += CP->getOffset();
	      AM.SymbolFlags = CP->getTargetFlags();
	    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
	      AM.ES = S->getSymbol();
	      AM.SymbolFlags = S->getTargetFlags();
	    } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
	      AM.MCSym = S->getMCSymbol();
	    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
	      AM.JT = J->getIndex();
	      AM.SymbolFlags = J->getTargetFlags();
	    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
	      AM.BlockAddr = BA->getBlockAddress();
	      AM.Disp += BA->getOffset();
	      AM.SymbolFlags = BA->getTargetFlags();
	    } else
	      llvm_unreachable("Unhandled symbol reference node.");
	    return false;
	  }

	  return true;
	}

	/// Fold node \p N into addressing mode \p AM by pure pattern matching, then
	/// apply two encoding-size clean-ups to the result.
	///
	/// \returns true if \p N cannot be added to \p AM, false on success.
	bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
	  const bool Unmatched = matchAddressRecursively(N, AM, /*Depth=*/0);
	  if (Unmatched)
	    return true;

	  // True while the mode uses a register base whose slot is still empty.
	  auto BaseSlotIsFree = [&AM]() {
	    return AM.BaseType == X86ISelAddressMode::RegBase &&
	           !AM.Base_Reg.getNode();
	  };

	  // Clean-up 1: lea(,%reg,2) becomes lea(%reg,%reg), which has a smaller
	  // encoding and avoids a scaled-index.
	  if (AM.Scale == 2 && BaseSlotIsFree()) {
	    AM.Base_Reg = AM.IndexReg;
	    AM.Scale = 1;
	  }

	  // Clean-up 2: a bare symbol "foo" becomes "foo(%rip)", even in non-PIC
	  // mode, because it has a smaller encoding.
	  // TODO: Which other code models can use this?
	  const bool SmallModel64 =
	      TM.getCodeModel() == CodeModel::Small && Subtarget->is64Bit();
	  if (SmallModel64 && AM.Scale == 1 && BaseSlotIsFree() &&
	      !AM.IndexReg.getNode() && AM.SymbolFlags == X86II::MO_NO_FLAG &&
	      AM.hasSymbolicDisplacement()) {
	    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
	  }

	  return false;
	}

	/// Try to fold the two-operand node \p N (used for ADD, and for OR when the
	/// operands share no set bits) into \p AM.
	///
	/// Three strategies are attempted in order: fold operand 0 then operand 1,
	/// fold operand 1 then operand 0, and finally place the operands directly
	/// into the base and index registers if both slots are empty.
	///
	/// \returns false if one strategy succeeded, true otherwise (in which case
	/// \p AM has been restored to its value on entry).
	bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
	                               unsigned Depth) {
	  // Hold an artificial use of the node so it can still be found if it gets
	  // CSE'd with a different node while we recurse.
	  HandleSDNode Handle(N);

	  X86ISelAddressMode Saved = AM;
	  const unsigned NextDepth = Depth + 1;

	  // Strategy 1: operand 0 first, then operand 1.
	  if (!matchAddressRecursively(N.getOperand(0), AM, NextDepth)) {
	    if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
	                                 NextDepth))
	      return false;
	  }
	  AM = Saved;

	  // Strategy 2: the same with the operands commuted.
	  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
	                               NextDepth)) {
	    if (!matchAddressRecursively(Handle.getValue().getOperand(0), AM,
	                                 NextDepth))
	      return false;
	  }
	  AM = Saved;

	  // Strategy 3: we could not fold both operands into the address at once, so
	  // see whether each operand can simply live in a register, which at least
	  // folds the add itself.
	  N = Handle.getValue();
	  const bool BothSlotsEmpty =
	      AM.BaseType == X86ISelAddressMode::RegBase &&
	      !AM.Base_Reg.getNode() && !AM.IndexReg.getNode();
	  if (!BothSlotsEmpty)
	    return true;

	  AM.Base_Reg = N.getOperand(0);
	  AM.IndexReg = N.getOperand(1);
	  AM.Scale = 1;
	  return false;
	}

	// Insert a node into the DAG at least before the Pos node's position. This
	// will reposition the node as needed, and will assign it a node ID that is <=
	// the Pos node's ID. Note that this does not preserve the uniqueness of node
	// IDs! The selection DAG must no longer depend on their uniqueness when this
	// is used.
	//
	// Nothing is done when N already has an ID that is not -1 and does not
	// exceed Pos's ID.
	static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
	  if (N.getNode()->getNodeId() == -1 ||
	      N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
	    DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
	    N.getNode()->setNodeId(Pos.getNode()->getNodeId());
	  }
	}

	// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
	// safe. This allows us to convert the shift and and into an h-register
	// extract and a scaled index.
	//
	// N is the node being replaced, Mask the constant it is masked with, Shift
	// the shift node and X the value being shifted. On success all uses of N are
	// redirected to the new SHL, AM.IndexReg is set to the new "(X >> 8) & 0xff"
	// node and AM.Scale to 1 << C1.
	//
	// Returns false if the simplification is performed, true otherwise.
	static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
	                                      uint64_t Mask,
	                                      SDValue Shift, SDValue X,
	                                      X86ISelAddressMode &AM) {
	  if (Shift.getOpcode() != ISD::SRL ||
	      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
	      !Shift.hasOneUse())
	    return true;

	  // C1 must be 1, 2 or 3 (a legal address scale) and the mask must be
	  // exactly 0xff shifted left by C1.
	  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
	  if (ScaleLog <= 0 || ScaleLog >= 4 ||
	      Mask != (0xffu << ScaleLog))
	    return true;

	  MVT VT = N.getSimpleValueType();
	  SDLoc DL(N);
	  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
	  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
	  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
	  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
	  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
	  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);

	  // Insert the new nodes into the topological ordering. We must do this in
	  // a valid topological ordering as nothing is going to go back and re-sort
	  // these nodes. We continually insert before 'N' in sequence as this is
	  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
	  // hierarchy left to express.
	  insertDAGNode(DAG, N, Eight);
	  insertDAGNode(DAG, N, Srl);
	  insertDAGNode(DAG, N, NewMask);
	  insertDAGNode(DAG, N, And);
	  insertDAGNode(DAG, N, ShlCount);
	  insertDAGNode(DAG, N, Shl);
	  DAG.ReplaceAllUsesWith(N, Shl);
	  AM.IndexReg = And;
	  AM.Scale = (1 << ScaleLog);
	  return false;
	}

	// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
	// allows us to fold the shift into this addressing mode.
	//
	// N is the AND-style node being replaced, Mask is C2, Shift is the SHL node
	// and X the value being shifted. On success all uses of N are redirected to
	// the new SHL, AM.IndexReg is set to the new AND and AM.Scale to 1 << C1.
	//
	// Returns false if the transform succeeded, true otherwise.
	static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
	                                        uint64_t Mask,
	                                        SDValue Shift, SDValue X,
	                                        X86ISelAddressMode &AM) {
	  if (Shift.getOpcode() != ISD::SHL ||
	      !isa<ConstantSDNode>(Shift.getOperand(1)))
	    return true;

	  // Not likely to be profitable if either the AND or SHIFT node has more
	  // than one use (unless all uses are for address computation). Besides,
	  // isel mechanism requires their node ids to be reused.
	  if (!N.hasOneUse() || !Shift.hasOneUse())
	    return true;

	  // Verify that the shift amount is something we can fold.
	  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
	  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
	    return true;

	  MVT VT = N.getSimpleValueType();
	  SDLoc DL(N);
	  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
	  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
	  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));

	  // Insert the new nodes into the topological ordering. We must do this in
	  // a valid topological ordering as nothing is going to go back and re-sort
	  // these nodes. We continually insert before 'N' in sequence as this is
	  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
	  // hierarchy left to express.
	  insertDAGNode(DAG, N, NewMask);
	  insertDAGNode(DAG, N, NewAnd);
	  insertDAGNode(DAG, N, NewShift);
	  DAG.ReplaceAllUsesWith(N, NewShift);

	  AM.Scale = 1 << ShiftAmt;
	  AM.IndexReg = NewAnd;
	  return false;
	}

	// Implement some heroics to detect shifts of masked values where the mask can
	// be replaced by extending the shift and undoing that in the addressing mode
	// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
	// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
	// the addressing mode. This results in code such as:
	//
	//   int f(short *y, int *lookup_table) {
	//     ...
	//     return *y + lookup_table[*y >> 11];
	//   }
	//
	// Turning into:
	//   movzwl (%rdi), %eax
	//   movl %eax, %ecx
	//   shrl $11, %ecx
	//   addl (%rsi,%rcx,4), %eax
	//
	// Instead of:
	//   movzwl (%rdi), %eax
	//   movl %eax, %ecx
	//   shrl $9, %ecx
	//   andl $124, %rcx
	//   addl (%rsi,%rcx), %eax
	//
	// Note that this function assumes the mask is provided as a mask after the
	// value is shifted. The input chain may or may not match that, but computing
	// such a mask is trivial.
	//
	// On success all uses of N are redirected to a new SHL, AM.IndexReg is set to
	// the new (wider) SRL and AM.Scale to 1 << (trailing zero count of Mask).
	//
	// Returns false if the transform was performed, true otherwise.
	static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
	                                    uint64_t Mask,
	                                    SDValue Shift, SDValue X,
	                                    X86ISelAddressMode &AM) {
	  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
	      !isa<ConstantSDNode>(Shift.getOperand(1)))
	    return true;

	  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
	  unsigned MaskLZ = countLeadingZeros(Mask);
	  unsigned MaskTZ = countTrailingZeros(Mask);

	  // The amount of shift we're trying to fit into the addressing mode is taken
	  // from the trailing zeros of the mask.
	  unsigned AMShiftAmt = MaskTZ;

	  // There is nothing we can do here unless the mask is removing some bits.
	  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
	  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

	  // We also need to ensure that mask is a continuous run of bits.
	  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;

	  // Scale the leading zero count down based on the actual size of the value.
	  // Also scale it down based on the size of the shift.
	  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
	  if (MaskLZ < ScaleDown)
	    return true;
	  MaskLZ -= ScaleDown;

	  // The final check is to ensure that any masked out high bits of X are
	  // already known to be zero. Otherwise, the mask has a semantic impact
	  // other than masking out a couple of low bits. Unfortunately, because of
	  // the mask, zero extensions will be removed from operands in some cases.
	  // This code works extra hard to look through extensions because we can
	  // replace them with zero extensions cheaply if necessary.
	  bool ReplacingAnyExtend = false;
	  if (X.getOpcode() == ISD::ANY_EXTEND) {
	    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
	                          X.getOperand(0).getSimpleValueType().getSizeInBits();
	    // Assume that we'll replace the any-extend with a zero-extend, and
	    // narrow the search to the extended value.
	    X = X.getOperand(0);
	    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
	    ReplacingAnyExtend = true;
	  }
	  APInt MaskedHighBits =
	      APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
	  KnownBits Known;
	  DAG.computeKnownBits(X, Known);
	  if (MaskedHighBits != Known.Zero) return true;

	  // We've identified a pattern that can be transformed into a single shift
	  // and an addressing mode. Make it so.
	  MVT VT = N.getSimpleValueType();
	  if (ReplacingAnyExtend) {
	    assert(X.getValueType() != VT);
	    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
	    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
	    insertDAGNode(DAG, N, NewX);
	    X = NewX;
	  }
	  SDLoc DL(N);
	  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
	  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
	  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
	  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);

	  // Insert the new nodes into the topological ordering. We must do this in
	  // a valid topological ordering as nothing is going to go back and re-sort
	  // these nodes. We continually insert before 'N' in sequence as this is
	  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
	  // hierarchy left to express.
	  insertDAGNode(DAG, N, NewSRLAmt);
	  insertDAGNode(DAG, N, NewSRL);
	  insertDAGNode(DAG, N, NewSHLAmt);
	  insertDAGNode(DAG, N, NewSHL);
	  DAG.ReplaceAllUsesWith(N, NewSHL);

	  AM.Scale = 1 << AMShiftAmt;
	  AM.IndexReg = NewSRL;
	  return false;
	}

	/// Recursively try to fold node \p N into the addressing mode \p AM.
	///
	/// The node's opcode selects a folding strategy (constant, symbol wrapper,
	/// load, frame index, shift, multiply, sub, add, or, and). If the strategy
	/// does not apply or fails, the node is handed to matchAddressBase to be used
	/// as a plain base or index register.
	///
	/// \param N     node to fold.
	/// \param AM    addressing mode being built; updated in place.
	/// \param Depth current recursion depth; past 5 no further pattern matching
	///              is attempted and \p N goes straight to matchAddressBase.
	/// \returns false if \p N was folded into \p AM, true if it could not be.
	bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
	                                              unsigned Depth) {
	  SDLoc dl(N);
	  DEBUG({
	    dbgs() << "MatchAddress: ";
	    AM.dump();
	  });
	  // Limit recursion.
	  if (Depth > 5)
	    return matchAddressBase(N, AM);

	  // If this is already a %rip relative address, we can only merge immediates
	  // into it. Instead of handling this in every case, we handle it here.
	  // RIP relative addressing: %rip + 32-bit displacement!
	  if (AM.isRIPRelative()) {
	    // FIXME: JumpTable and ExternalSymbol address currently don't like
	    // displacements. It isn't very important, but this should be fixed for
	    // consistency.
	    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
	      return true;

	    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
	      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
	        return false;
	    return true;
	  }

	  switch (N.getOpcode()) {
	  default: break;
	  case ISD::LOCAL_RECOVER: {
	    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
	      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
	        // Use the symbol and don't prefix it.
	        AM.MCSym = ESNode->getMCSymbol();
	        return false;
	      }
	    break;
	  }
	  case ISD::Constant: {
	    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
	    if (!foldOffsetIntoAddress(Val, AM))
	      return false;
	    break;
	  }

	  case X86ISD::Wrapper:
	  case X86ISD::WrapperRIP:
	    if (!matchWrapper(N, AM))
	      return false;
	    break;

	  case ISD::LOAD:
	    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
	      return false;
	    break;

	  case ISD::FrameIndex:
	    if (AM.BaseType == X86ISelAddressMode::RegBase &&
	        AM.Base_Reg.getNode() == nullptr &&
	        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
	      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
	      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
	      return false;
	    }
	    break;

	  case ISD::SHL:
	    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
	      break;

	    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
	      unsigned Val = CN->getZExtValue();
	      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
	      // that the base operand remains free for further matching. If
	      // the base doesn't end up getting used, a post-processing step
	      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
	      if (Val == 1 || Val == 2 || Val == 3) {
	        AM.Scale = 1 << Val;
	        SDValue ShVal = N.getOperand(0);

	        // Okay, we know that we have a scale by now. However, if the scaled
	        // value is an add of something and a constant, we can fold the
	        // constant into the disp field here.
	        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
	          // Tentatively index by the add's first operand; this is
	          // overwritten below if the scaled constant cannot be folded.
	          AM.IndexReg = ShVal.getOperand(0);
	          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
	          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
	          if (!foldOffsetIntoAddress(Disp, AM))
	            return false;
	        }

	        AM.IndexReg = ShVal;
	        return false;
	      }
	    }
	    break;

	  case ISD::SRL: {
	    // Scale must not be used already.
	    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

	    SDValue And = N.getOperand(0);
	    if (And.getOpcode() != ISD::AND) break;
	    SDValue X = And.getOperand(0);

	    // We only handle up to 64-bit values here as those are what matter for
	    // addressing mode optimizations.
	    if (X.getSimpleValueType().getSizeInBits() > 64) break;

	    // The mask used for the transform is expected to be post-shift, but we
	    // found the shift first so just apply the shift to the mask before passing
	    // it down.
	    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
	        !isa<ConstantSDNode>(And.getOperand(1)))
	      break;
	    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);

	    // Try to fold the mask and shift into the scale, and return false if we
	    // succeed.
	    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
	      return false;
	    break;
	  }

	  case ISD::SMUL_LOHI:
	  case ISD::UMUL_LOHI:
	    // A mul_lohi where we need the low part can be folded as a plain multiply.
	    if (N.getResNo() != 0) break;
	    LLVM_FALLTHROUGH;
	  case ISD::MUL:
	  case X86ISD::MUL_IMM:
	    // X*[3,5,9] -> X+X*[2,4,8]
	    if (AM.BaseType == X86ISelAddressMode::RegBase &&
	        AM.Base_Reg.getNode() == nullptr &&
	        AM.IndexReg.getNode() == nullptr) {
	      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
	        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
	            CN->getZExtValue() == 9) {
	          AM.Scale = unsigned(CN->getZExtValue())-1;

	          SDValue MulVal = N.getOperand(0);
	          SDValue Reg;

	          // Okay, we know that we have a scale by now. However, if the scaled
	          // value is an add of something and a constant, we can fold the
	          // constant into the disp field here.
	          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
	              isa<ConstantSDNode>(MulVal.getOperand(1))) {
	            Reg = MulVal.getOperand(0);
	            ConstantSDNode *AddVal =
	                cast<ConstantSDNode>(MulVal.getOperand(1));
	            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
	            // If the scaled constant does not fit, fall back to using the
	            // whole multiplicand as the register.
	            if (foldOffsetIntoAddress(Disp, AM))
	              Reg = N.getOperand(0);
	          } else {
	            Reg = N.getOperand(0);
	          }

	          AM.IndexReg = AM.Base_Reg = Reg;
	          return false;
	        }
	    }
	    break;

	  case ISD::SUB: {
	    // Given A-B, if A can be completely folded into the address and
	    // the index field with the index field unused, use -B as the index.
	    // This is a win if a has multiple parts that can be folded into
	    // the address. Also, this saves a mov if the base register has
	    // other uses, since it avoids a two-address sub instruction, however
	    // it costs an additional mov if the index register has other uses.

	    // Add an artificial use to this node so that we can keep track of
	    // it if it gets CSE'd with a different node.
	    HandleSDNode Handle(N);

	    // Test if the LHS of the sub can be folded.
	    X86ISelAddressMode Backup = AM;
	    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
	      AM = Backup;
	      break;
	    }
	    // Test if the index field is free for use.
	    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
	      AM = Backup;
	      break;
	    }

	    int Cost = 0;
	    SDValue RHS = Handle.getValue().getOperand(1);
	    // If the RHS involves a register with multiple uses, this
	    // transformation incurs an extra mov, due to the neg instruction
	    // clobbering its operand.
	    if (!RHS.getNode()->hasOneUse() ||
	        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
	        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
	        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
	        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
	         RHS.getOperand(0).getValueType() == MVT::i32))
	      ++Cost;
	    // If the base is a register with multiple uses, this
	    // transformation may save a mov.
	    // FIXME: Don't rely on DELETED_NODEs.
	    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
	         AM.Base_Reg->getOpcode() != ISD::DELETED_NODE &&
	         !AM.Base_Reg.getNode()->hasOneUse()) ||
	        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
	      --Cost;
	    // If the folded LHS was interesting, this transformation saves
	    // address arithmetic.
	    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
	        ((AM.Disp != 0) && (Backup.Disp == 0)) +
	        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
	      --Cost;
	    // If it doesn't look like it may be an overall win, don't do it.
	    if (Cost >= 0) {
	      AM = Backup;
	      break;
	    }

	    // Ok, the transformation is legal and appears profitable. Go for it.
	    SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
	    SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
	    AM.IndexReg = Neg;
	    AM.Scale = 1;

	    // Insert the new nodes into the topological ordering.
	    insertDAGNode(*CurDAG, Handle.getValue(), Zero);
	    insertDAGNode(*CurDAG, Handle.getValue(), Neg);
	    return false;
	  }

	  case ISD::ADD:
	    if (!matchAdd(N, AM, Depth))
	      return false;
	    break;

	  case ISD::OR:
	    // We want to look through a transform in InstCombine and DAGCombiner that
	    // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
	    // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
	    // An 'lea' can then be used to match the shift (multiply) and add:
	    //   and $1, %esi
	    //   lea (%rsi, %rdi, 8), %rax
	    if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
	        !matchAdd(N, AM, Depth))
	      return false;
	    break;

	  case ISD::AND: {
	    // Perform some heroic transforms on an and of a constant-count shift
	    // with a constant to enable use of the scaled offset field.

	    // Scale must not be used already.
	    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

	    SDValue Shift = N.getOperand(0);
	    if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
	    SDValue X = Shift.getOperand(0);

	    // We only handle up to 64-bit values here as those are what matter for
	    // addressing mode optimizations.
	    if (X.getSimpleValueType().getSizeInBits() > 64) break;

	    if (!isa<ConstantSDNode>(N.getOperand(1)))
	      break;
	    uint64_t Mask = N.getConstantOperandVal(1);

	    // Try to fold the mask and shift into an extract and scale.
	    if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
	      return false;

	    // Try to fold the mask and shift directly into the scale.
	    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
	      return false;

	    // Try to swap the mask and shift to place shifts which can be done as
	    // a scale on the outside of the mask.
	    if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
	      return false;
	    break;
	  }
	  }

	  // No pattern applied: use the node itself as a base or index register.
	  return matchAddressBase(N, AM);
	}

	/// Helper for MatchAddress. Add the specified node to the
	/// specified addressing mode without any further recursion.
	///
	/// \p N becomes the base register if that slot is free; otherwise it becomes
	/// the index register (with scale 1) if that slot is free.
	///
	/// \returns false if \p N was placed in \p AM, true if both slots are taken.
	bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
	  // Is the base register already occupied?
	  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
	    // If so, check to see if the scale index register is set.
	    if (!AM.IndexReg.getNode()) {
	      AM.IndexReg = N;
	      AM.Scale = 1;
	      return false;
	    }

	    // Otherwise, we cannot select it.
	    return true;
	  }

	  // Default, generate it as a register.
	  AM.BaseType = X86ISelAddressMode::RegBase;
	  AM.Base_Reg = N;
	  return false;
	}

	/// Helper for selectVectorAddr. Folds what it can of \p N into the base and
	/// displacement of a gather/scatter address; the index register and scale
	/// are expected to have been filled in by the caller already.
	///
	/// \returns false if \p N was folded into \p AM, true otherwise.
	bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
	  // TODO: Support other operations.
	  // Only symbol wrappers are pattern matched for now.
	  const bool IsWrapper = N.getOpcode() == X86ISD::Wrapper;
	  if (IsWrapper && !matchWrapper(N, AM))
	    return false;

	  // Anything else (or a wrapper that did not fold) is used as a register.
	  return matchAddressBase(N, AM);
	}

	/// Build the address operands of a masked gather/scatter node.
	///
	/// \param Parent the X86MaskedGatherScatterSDNode that owns the address.
	/// \param N      the base pointer operand.
	/// \param Base, Scale, Index, Disp, Segment
	///               receive the address operands on success.
	/// \returns true if the address was matched, false otherwise.
	bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
	                                       SDValue &Scale, SDValue &Index,
	                                       SDValue &Disp, SDValue &Segment) {
	  X86ISelAddressMode AM;

	  // The index comes from the node itself; the scale starts out as the size
	  // in bytes of one scalar element of the node's value.
	  auto *GatherScatter = cast<X86MaskedGatherScatterSDNode>(Parent);
	  AM.IndexReg = GatherScatter->getIndex();
	  AM.Scale = GatherScatter->getValue().getScalarValueSizeInBits() / 8;

	  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
	  const unsigned AS = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
	  switch (AS) {
	  case 256:
	    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
	    break;
	  case 257:
	    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
	    break;
	  case 258:
	    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
	    break;
	  default:
	    break;
	  }

	  if (!isa<ConstantSDNode>(N)) {
	    // Try to match into the base and displacement fields.
	    if (matchVectorAddress(N, AM))
	      return false;
	  } else {
	    // If Base is 0, the whole address is in index and the Scale is 1
	    assert(cast<ConstantSDNode>(N)->isNullValue() &&
	           "Unexpected base in gather/scatter");
	    AM.Scale = 1;
	  }

	  // An empty register base is represented by register 0.
	  MVT VT = N.getSimpleValueType();
	  if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode())
	    AM.Base_Reg = CurDAG->getRegister(0, VT);

	  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
	  return true;
	}

	/// Pattern match the maximal addressing mode for \p N and hand its operands
	/// back by reference.
	///
	/// Parent is the parent node of the addr operand that is being matched. It
	/// is always a load, store, atomic node, or null. It is only null when
	/// checking memory operands for inline asm nodes.
	///
	/// \returns true if an addressing mode was matched, false otherwise.
	bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
	                                 SDValue &Scale, SDValue &Index,
	                                 SDValue &Disp, SDValue &Segment) {
	  X86ISelAddressMode AM;

	  // These opcodes are all the nodes that have an "addr:$ptr" operand but are
	  // not a MemSDNode, and thus don't have proper addrspace info.
	  auto CarriesAddrSpace = [](const SDNode *P) {
	    switch (P->getOpcode()) {
	    case ISD::INTRINSIC_W_CHAIN:    // unaligned loads, fixme
	    case ISD::INTRINSIC_VOID:       // nontemporal stores
	    case X86ISD::TLSCALL:           // Fixme
	    case X86ISD::EH_SJLJ_SETJMP:    // setjmp
	    case X86ISD::EH_SJLJ_LONGJMP:   // longjmp
	      return false;
	    default:
	      return true;
	    }
	  };

	  if (Parent && CarriesAddrSpace(Parent)) {
	    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
	    switch (cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace()) {
	    case 256:
	      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
	      break;
	    case 257:
	      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
	      break;
	    case 258:
	      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
	      break;
	    default:
	      break;
	    }
	  }

	  if (matchAddress(N, AM))
	    return false;

	  // Empty register slots are represented by register 0.
	  MVT VT = N.getSimpleValueType();
	  if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode())
	    AM.Base_Reg = CurDAG->getRegister(0, VT);
	  if (!AM.IndexReg.getNode())
	    AM.IndexReg = CurDAG->getRegister(0, VT);

	  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
	  return true;
	}

	// We can only fold a load if all nodes between it and the root node have a
	// single use. If there are additional uses, we could end up duplicating the
	// load.
	static bool hasSingleUsesFromRoot(SDNode Root, SDNode N) {
	SDNode User = N->use_begin();
	while (User != Root) {
	if (!User->hasOneUse())
	return false;
	User = *User->use_begin();
	}

	return true;
	}

	/// Match a scalar SSE load. In particular, we want to match a load whose top
	/// elements are either undef or zeros. The load flavor is derived from the
	/// type of N, which is either v4f32 or v2f64.
	///
	/// Four shapes of \p N are tried in turn: a plain non-extending load, an
	/// X86ISD::VZEXT_LOAD, a single-use SCALAR_TO_VECTOR of a load, and a
	/// single-use X86ISD::VZEXT_MOVL of a single-use SCALAR_TO_VECTOR of a load.
	/// Each candidate must pass IsProfitableToFold, IsLegalToFold and
	/// hasSingleUsesFromRoot before its address is matched with selectAddr.
	///
	/// \param Root the node the fold is being performed for.
	/// \param N    the candidate vector value.
	/// \param Base, Scale, Index, Disp, Segment
	///             receive the address operands on success.
	/// \param PatternNodeWithChain
	///             receives the matched node that has a chain input and output.
	///             Note that it is also written when a candidate shape is
	///             recognised but then rejected, so it is only meaningful when
	///             the function returns true.
	/// \returns true if a foldable load was matched, false otherwise.
	bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
	SDValue N, SDValue &Base,
	SDValue &Scale, SDValue &Index,
	SDValue &Disp, SDValue &Segment,
	SDValue &PatternNodeWithChain) {
	// We can allow a full vector load here since narrowing a load is ok.
	if (ISD::isNON_EXTLoad(N.getNode())) {
	PatternNodeWithChain = N;
	// *N->use_begin() is only evaluated once IsProfitableToFold has accepted
	// the node (short-circuit order of the && chain).
	if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
	IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
	hasSingleUsesFromRoot(Root, N.getNode())) {
	LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
	return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
	Segment);
	}
	}

	// We can also match the special zero extended load opcode.
	if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
	PatternNodeWithChain = N;
	if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
	IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
	hasSingleUsesFromRoot(Root, N.getNode())) {
	auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
	return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
	Segment);
	}
	}

	// Need to make sure that the SCALAR_TO_VECTOR and load are both only used
	// once. Otherwise the load might get duplicated and the chain output of the
	// duplicate load will not be observed by all dependencies.
	if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
	PatternNodeWithChain = N.getOperand(0);
	if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
	IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
	IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
	hasSingleUsesFromRoot(Root, N.getNode())) {
	LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
	return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
	Segment);
	}
	}

	// Also handle the case where we explicitly require zeros in the top
	// elements. This is a vector shuffle from the zero vector.
	if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
	// Check to see if the top elements are all zeros (or bitcast of zeros).
	N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
	N.getOperand(0).getNode()->hasOneUse()) {
	PatternNodeWithChain = N.getOperand(0).getOperand(0);
	if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
	IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
	IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
	hasSingleUsesFromRoot(Root, N.getNode())) {
	// Okay, this is a zero extending load. Fold it.
	LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
	return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
	Segment);
	}
	}

	// None of the recognised shapes could be folded.
	return false;
	}


	/// Decide whether \p N can be materialised as a 32-bit immediate moved into
	/// a 64-bit register, and if so return the immediate operand in \p Imm.
	///
	/// \p N is either a constant or an X86ISD::Wrapper around a label.
	/// \returns true if \p Imm is usable as such an immediate.
	bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
	  // Plain constants qualify exactly when they fit in 32 unsigned bits.
	  if (const ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) {
	    const uint64_t Value = Cst->getZExtValue();
	    if (isUInt<32>(Value)) {
	      Imm = CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
	      return true;
	    }
	    return false;
	  }

	  // In static codegen with small code model, we can get the address of a label
	  // into a register with 'movl'. TableGen has already made sure we're looking
	  // at a label of some kind.
	  assert(N->getOpcode() == X86ISD::Wrapper &&
	         "Unexpected node type for MOV32ri64");
	  N = N.getOperand(0);
	  const unsigned SymOpc = N->getOpcode();

	  // At least GNU as does not accept 'movl' for TPOFF relocations.
	  // FIXME: We could use 'movl' when we know we are targeting MC.
	  if (SymOpc == ISD::TargetGlobalTLSAddress)
	    return false;

	  Imm = N;

	  // A global with a known absolute symbol range is decided by that range
	  // alone: its largest value must be below 2^32.
	  if (SymOpc == ISD::TargetGlobalAddress) {
	    Optional<ConstantRange> Range =
	        cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
	    if (Range)
	      return Range->getUnsignedMax().ult(1ull << 32);
	  }

	  // Every other label is acceptable only under the small code model.
	  return TM.getCodeModel() == CodeModel::Small;
	}

	/// Match an LEA address via selectLEAAddr and then widen its 32-bit base and
	/// index operands to 64 bits.
	///
	/// A "no register" operand (register 0) is re-created as an i64 register 0.
	/// A 32-bit base that is not a frame index, and any other index, is wrapped
	/// in a SUBREG_TO_REG node producing an i64 value.
	///
	/// \returns true if the address was matched, false otherwise.
	bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
	                                         SDValue &Scale, SDValue &Index,
	                                         SDValue &Disp, SDValue &Segment) {
	  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
	  SDLoc DL(N);

	  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
	    return false;

	  // Wrap a 32-bit value in SUBREG_TO_REG so it can be used as a 64-bit
	  // address operand.
	  auto WidenTo64 = [&](SDValue V) {
	    return SDValue(CurDAG->getMachineNode(
	                       TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
	                       CurDAG->getTargetConstant(0, DL, MVT::i64),
	                       V,
	                       CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
	                   0);
	  };

	  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
	  if (RN && RN->getReg() == 0)
	    Base = CurDAG->getRegister(0, MVT::i64);
	  else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
	    // Base could already be %rip, particularly in the x32 ABI.
	    Base = WidenTo64(Base);
	  }

	  RN = dyn_cast<RegisterSDNode>(Index);
	  if (RN && RN->getReg() == 0)
	    Index = CurDAG->getRegister(0, MVT::i64);
	  else {
	    assert(Index.getValueType() == MVT::i32 &&
	           "Expect to be extending 32-bit registers for use in LEA");
	    Index = WidenTo64(Index);
	  }

	  return true;
	}

	/// Calls SelectAddr and determines if the maximal addressing
	/// mode it matches can be cost effectively emitted as an LEA instruction.
	///
	/// A complexity score is accumulated from the matched components (base,
	/// index, scale, symbolic displacement, numeric displacement); the address
	/// is rejected unless the score exceeds 2.
	///
	/// \returns true if \p N was matched and an LEA is considered worthwhile,
	/// in which case the operands are returned by reference; false otherwise.
	bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
	                                    SDValue &Base, SDValue &Scale,
	                                    SDValue &Index, SDValue &Disp,
	                                    SDValue &Segment) {
	  X86ISelAddressMode AM;

	  // Save the DL and VT before calling matchAddress, it can invalidate N.
	  SDLoc DL(N);
	  MVT VT = N.getSimpleValueType();

	  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
	  // segments.
	  SDValue Copy = AM.Segment;
	  SDValue T = CurDAG->getRegister(0, MVT::i32);
	  AM.Segment = T;
	  if (matchAddress(N, AM))
	    return false;
	  assert (T == AM.Segment);
	  AM.Segment = Copy;

	  unsigned Complexity = 0;
	  if (AM.BaseType == X86ISelAddressMode::RegBase) {
	    if (AM.Base_Reg.getNode())
	      Complexity = 1;
	    else
	      AM.Base_Reg = CurDAG->getRegister(0, VT);
	  } else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) {
	    Complexity = 4;
	  }

	  if (AM.IndexReg.getNode())
	    Complexity++;
	  else
	    AM.IndexReg = CurDAG->getRegister(0, VT);

	  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
	  // a simple shift.
	  if (AM.Scale > 1)
	    Complexity++;

	  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
	  // to a LEA. This is determined with some experimentation but is by no means
	  // optimal (especially for code size consideration). LEA is nice because of
	  // its three-address nature. Tweak the cost function again when we can run
	  // convertToThreeAddress() at register allocation time.
	  if (AM.hasSymbolicDisplacement()) {
	    // For X86-64, always use LEA to materialize RIP-relative addresses.
	    if (Subtarget->is64Bit())
	      Complexity = 4;
	    else
	      Complexity += 2;
	  }

	  if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
	    Complexity++;

	  // If it isn't worth using an LEA, reject it.
	  if (Complexity <= 2)
	    return false;

	  getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
	  return true;
	}

	/// Build the address operands for a TargetGlobalTLSAddress node. N must be
	/// such a node (asserted). Always returns true.
	bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
	                                        SDValue &Scale, SDValue &Index,
	                                        SDValue &Disp, SDValue &Segment) {
	  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
	  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
	  const bool Is32Bit = N.getValueType() == MVT::i32;

	  // Symbol, offset and target flags come straight from the global address
	  // node; the base is register 0 of N's type.
	  X86ISelAddressMode AM;
	  AM.GV = GA->getGlobal();
	  AM.Disp += GA->getOffset();
	  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
	  AM.SymbolFlags = GA->getTargetFlags();

	  // For i32 the index is EBX with scale 1; otherwise it is a 64-bit
	  // register-0 node.
	  if (Is32Bit)
	    AM.Scale = 1;
	  AM.IndexReg = Is32Bit ? CurDAG->getRegister(X86::EBX, MVT::i32)
	                        : CurDAG->getRegister(0, MVT::i64);

	  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
	  return true;
	}

	/// Try to select N as an immediate operand that may need a relocation.
	/// Accepts plain constants and X86ISD::Wrapper nodes, optionally seen through
	/// a TRUNCATE. On success the operand to use is stored in Op.
	bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
	  // A plain constant simply becomes a target constant of the same type.
	  if (auto *Const = dyn_cast<ConstantSDNode>(N)) {
	    Op = CurDAG->getTargetConstant(Const->getAPIntValue(), SDLoc(Const),
	                                   N.getValueType());
	    return true;
	  }

	  // Remember the requested type and whether we had to look through a
	  // truncate. A truncation from pointer type to VT that only drops bits
	  // known to be zero still allows a narrow reference.
	  const EVT VT = N.getValueType();
	  const bool LookedThroughTrunc = N.getOpcode() == ISD::TRUNCATE;
	  if (LookedThroughTrunc)
	    N = N.getOperand(0);

	  if (N.getOpcode() != X86ISD::Wrapper)
	    return false;

	  // Without range information a non-GlobalValue can only be used when it was
	  // not truncated; an untruncated GlobalValue can be used directly as well.
	  SDValue Wrapped = N.getOperand(0);
	  const bool IsGlobal = Wrapped->getOpcode() == ISD::TargetGlobalAddress;
	  if (!IsGlobal || !LookedThroughTrunc) {
	    Op = Wrapped;
	    // Direct selection is only valid if no truncate was skipped.
	    return !LookedThroughTrunc;
	  }

	  // Truncated global: its absolute symbol range has to fit into VT.
	  auto *GA = cast<GlobalAddressSDNode>(Wrapped);
	  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
	  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
	    return false;

	  // The range fits, so emit a narrow reference of type VT.
	  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
	                                      GA->getOffset(), GA->getTargetFlags());
	  return true;
	}

	bool X86DAGToDAGISel::tryFoldLoad(SDNode Root, SDNode P, SDValue N,
	SDValue &Base, SDValue &Scale,
	SDValue &Index, SDValue &Disp,
	SDValue &Segment) {
	if (!ISD::isNON_EXTLoad(N.getNode()) \|\|
	!IsProfitableToFold(N, P, Root) \|\|
	!IsLegalToFold(N, P, Root, OptLevel))
	return false;

	return selectAddr(N.getNode(),
	N.getOperand(1), Base, Scale, Index, Disp, Segment);
	}

	/// Return an SDNode holding the global base register. The register number is
	/// obtained from the instruction info's getGlobalBaseReg(MF) and wrapped in a
	/// register node of the target's pointer type.
	SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
	  const unsigned BaseReg = getInstrInfo()->getGlobalBaseReg(MF);
	  auto PtrVT = TLI->getPointerTy(MF->getDataLayout());
	  SDValue RegNode = CurDAG->getRegister(BaseReg, PtrVT);
	  return RegNode.getNode();
	}

	/// Return true if N (optionally behind a TRUNCATE) is an X86ISD::Wrapper
	/// around a global address whose absolute symbol range is known and satisfies
	/// signed-min >= (-1ull << Width) and signed-max < (1ull << Width).
	bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
	  // Look through a truncate, if there is one.
	  if (N->getOpcode() == ISD::TRUNCATE)
	    N = N->getOperand(0).getNode();

	  if (N->getOpcode() != X86ISD::Wrapper)
	    return false;

	  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
	  if (!GA)
	    return false;

	  // Globals without an absolute symbol range never qualify.
	  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
	  if (!CR)
	    return false;

	  return CR->getSignedMin().sge(-1ull << Width) &&
	         CR->getSignedMax().slt(1ull << Width);
	}

	/// Return true when no user of the given X86ISD::CMP node needs the SF or OF
	/// bits to be accurate. Any user that is not understood is treated
	/// conservatively (the function returns false).
	static bool hasNoSignedComparisonUses(SDNode *N) {
	  for (SDNode::use_iterator User = N->use_begin(), UserEnd = N->use_end();
	       User != UserEnd; ++User) {
	    // Every user must be a CopyToReg whose destination is EFLAGS.
	    if (User->getOpcode() != ISD::CopyToReg ||
	        cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
	      return false;

	    // Now look at who consumes that copy.
	    for (SDNode::use_iterator FlagUser = User->use_begin(),
	                              FlagUserEnd = User->use_end();
	         FlagUser != FlagUserEnd; ++FlagUser) {
	      // Only uses of result number 1 (the flag result) matter.
	      if (FlagUser.getUse().getResNo() != 1)
	        continue;
	      // Not yet selected to a machine instruction: be conservative.
	      if (!FlagUser->isMachineOpcode())
	        return false;

	      switch (FlagUser->getMachineOpcode()) {
	      // These comparisons don't treat the most significant bit specially.
	      case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
	      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
	      case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
	      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
	      case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
	      case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
	      case X86::CMOVA16rr: case X86::CMOVA16rm:
	      case X86::CMOVA32rr: case X86::CMOVA32rm:
	      case X86::CMOVA64rr: case X86::CMOVA64rm:
	      case X86::CMOVAE16rr: case X86::CMOVAE16rm:
	      case X86::CMOVAE32rr: case X86::CMOVAE32rm:
	      case X86::CMOVAE64rr: case X86::CMOVAE64rm:
	      case X86::CMOVB16rr: case X86::CMOVB16rm:
	      case X86::CMOVB32rr: case X86::CMOVB32rm:
	      case X86::CMOVB64rr: case X86::CMOVB64rm:
	      case X86::CMOVBE16rr: case X86::CMOVBE16rm:
	      case X86::CMOVBE32rr: case X86::CMOVBE32rm:
	      case X86::CMOVBE64rr: case X86::CMOVBE64rm:
	      case X86::CMOVE16rr: case X86::CMOVE16rm:
	      case X86::CMOVE32rr: case X86::CMOVE32rm:
	      case X86::CMOVE64rr: case X86::CMOVE64rm:
	      case X86::CMOVNE16rr: case X86::CMOVNE16rm:
	      case X86::CMOVNE32rr: case X86::CMOVNE32rm:
	      case X86::CMOVNE64rr: case X86::CMOVNE64rm:
	      case X86::CMOVNP16rr: case X86::CMOVNP16rm:
	      case X86::CMOVNP32rr: case X86::CMOVNP32rm:
	      case X86::CMOVNP64rr: case X86::CMOVNP64rm:
	      case X86::CMOVP16rr: case X86::CMOVP16rm:
	      case X86::CMOVP32rr: case X86::CMOVP32rm:
	      case X86::CMOVP64rr: case X86::CMOVP64rm:
	        // Acceptable user; move on to the next one.
	        break;
	      // Anything else: assume conservatively.
	      default:
	        return false;
	      }
	    }
	  }
	  return true;
	}

	/// Return true when no user of the flag result of the given flag-setting node
	/// needs the CF flag to be accurate. Any user that is not understood is
	/// treated conservatively (the function returns false).
	static bool hasNoCarryFlagUses(SDNode *N) {
	  for (SDNode::use_iterator User = N->use_begin(), UserEnd = N->use_end();
	       User != UserEnd; ++User) {
	    // Skip users that do not consume result number 1 (the flags).
	    if (User.getUse().getResNo() != 1)
	      continue;

	    // A flag user must be a CopyToReg whose destination is EFLAGS.
	    if (User->getOpcode() != ISD::CopyToReg ||
	        cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
	      return false;

	    // Now look at who consumes that copy.
	    for (SDNode::use_iterator FlagUser = User->use_begin(),
	                              FlagUserEnd = User->use_end();
	         FlagUser != FlagUserEnd; ++FlagUser) {
	      // Only uses of result number 1 (the flag result) matter.
	      if (FlagUser.getUse().getResNo() != 1)
	        continue;
	      // Not yet selected to a machine instruction: be conservative.
	      if (!FlagUser->isMachineOpcode())
	        return false;

	      switch (FlagUser->getMachineOpcode()) {
	      // Comparisons which don't examine the CF flag.
	      case X86::SETOr: case X86::SETNOr: case X86::SETEr: case X86::SETNEr:
	      case X86::SETSr: case X86::SETNSr: case X86::SETPr: case X86::SETNPr:
	      case X86::SETLr: case X86::SETGEr: case X86::SETLEr: case X86::SETGr:
	      case X86::JO_1: case X86::JNO_1: case X86::JE_1: case X86::JNE_1:
	      case X86::JS_1: case X86::JNS_1: case X86::JP_1: case X86::JNP_1:
	      case X86::JL_1: case X86::JGE_1: case X86::JLE_1: case X86::JG_1:
	      case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
	      case X86::CMOVO16rm: case X86::CMOVO32rm: case X86::CMOVO64rm:
	      case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr:
	      case X86::CMOVNO16rm: case X86::CMOVNO32rm: case X86::CMOVNO64rm:
	      case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
	      case X86::CMOVE16rm: case X86::CMOVE32rm: case X86::CMOVE64rm:
	      case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
	      case X86::CMOVNE16rm: case X86::CMOVNE32rm: case X86::CMOVNE64rm:
	      case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
	      case X86::CMOVS16rm: case X86::CMOVS32rm: case X86::CMOVS64rm:
	      case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
	      case X86::CMOVNS16rm: case X86::CMOVNS32rm: case X86::CMOVNS64rm:
	      case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
	      case X86::CMOVP16rm: case X86::CMOVP32rm: case X86::CMOVP64rm:
	      case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
	      case X86::CMOVNP16rm: case X86::CMOVNP32rm: case X86::CMOVNP64rm:
	      case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
	      case X86::CMOVL16rm: case X86::CMOVL32rm: case X86::CMOVL64rm:
	      case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
	      case X86::CMOVGE16rm: case X86::CMOVGE32rm: case X86::CMOVGE64rm:
	      case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
	      case X86::CMOVLE16rm: case X86::CMOVLE32rm: case X86::CMOVLE64rm:
	      case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
	      case X86::CMOVG16rm: case X86::CMOVG32rm: case X86::CMOVG64rm:
	        // Acceptable user; move on to the next one.
	        break;
	      // Anything else: assume conservatively.
	      default:
	        return false;
	      }
	    }
	  }
	  return true;
	}

	/// Check whether or not the chain ending in StoreNode is suitable for doing
	/// the {load; op; store} to modify transformation.
	///
	/// \param StoreNode   the store at the end of the pattern.
	/// \param StoredVal   the value being stored (the result of the operation).
	/// \param CurDAG      DAG used to build a replacement TokenFactor if needed.
	/// \param LoadNode    [out] set to the load feeding the operation. It is
	///                    assigned as soon as operand 0 of the operation is known
	///                    to be a normal load, even if a later check fails.
	/// \param InputChain  [out] on success, the chain the fused node should use:
	///                    the load's input chain, or a new TokenFactor in which
	///                    the load's output chain is replaced by its input chain.
	/// \returns true if the pattern can be fused.
	static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
	                                        SDValue StoredVal, SelectionDAG *CurDAG,
	                                        LoadSDNode *&LoadNode,
	                                        SDValue &InputChain) {
	  // Is the stored value result 0 of the operation that produced it?
	  if (StoredVal.getResNo() != 0) return false;

	  // Result 0 of the operation must have exactly one use (the store).
	  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;

	  // Is the store non-extending, non-indexed and not non-temporal?
	  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
	    return false;

	  SDValue Load = StoredVal->getOperand(0);
	  // Is operand 0 of the operation a non-extending and non-indexed load?
	  if (!ISD::isNormalLoad(Load.getNode())) return false;

	  // Return LoadNode by reference.
	  LoadNode = cast<LoadSDNode>(Load);

	  // Is the operation the only reader of the loaded value?
	  if (!Load.hasOneUse())
	    return false;

	  // Is the address of the store the same as the load?
	  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
	      LoadNode->getOffset() != StoreNode->getOffset())
	    return false;

	  // Check if the chain is produced by the load or is a TokenFactor with
	  // the load output chain as an operand. Return InputChain by reference.
	  SDValue Chain = StoreNode->getChain();

	  bool ChainCheck = false;
	  if (Chain == Load.getValue(1)) {
	    ChainCheck = true;
	    InputChain = LoadNode->getChain();
	  } else if (Chain.getOpcode() == ISD::TokenFactor) {
	    SmallVector<SDValue, 4> ChainOps;
	    // The load's node id does not change while we scan the operands.
	    const int LoadId = LoadNode->getNodeId();
	    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
	      SDValue Op = Chain.getOperand(i);
	      if (Op == Load.getValue(1)) {
	        ChainCheck = true;
	        // Drop Load, but keep its chain. No cycle check necessary.
	        ChainOps.push_back(Load.getOperand(0));
	        continue;
	      }

	      // Make sure using Op as part of the chain would not cause a cycle here.
	      // In theory, we could check whether the chain node is a predecessor of
	      // the load. But that can be very expensive. Instead visit the uses and
	      // make sure they all have smaller node id than the load.
	      //
	      // The end iterator is taken from Op's own node; taking it from the
	      // first user would require dereferencing the begin iterator.
	      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	                                UE = Op.getNode()->use_end();
	           UI != UE; ++UI) {
	        if (UI.getUse().getResNo() != 0)
	          continue;
	        if (UI->getNodeId() > LoadId)
	          return false;
	      }

	      ChainOps.push_back(Op);
	    }

	    if (ChainCheck)
	      // Make a new TokenFactor with all the other input chains except
	      // for the load.
	      InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
	                                   MVT::Other, ChainOps);
	  }

	  return ChainCheck;
	}

	// Change a chain of {load; op; store} of the same value into a simple op
	// through memory of that value, if the uses of the modified value and its
	// address are suitable.
	//
	// The tablegen pattern memory operand pattern is currently not able to match
	// the case where the EFLAGS on the original operation are used.
	//
	// To move this to tablegen, we'll need to improve tablegen to allow flags to
	// be transferred from a node in the pattern to the result node, probably with
	// a new keyword. For example, we have this
	// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
	// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
	// (implicit EFLAGS)]>;
	// but maybe need something like this
	// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
	// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
	// (transferrable EFLAGS)]>;
	//
	// Until then, we manually fold these and instruction select the operation
	// here.
	//
	// Node must be a store (it is cast to StoreSDNode). Returns true if the
	// store, the operation and the load were replaced by a single machine node
	// and Node was removed; returns false, leaving the DAG untouched by this
	// function's own replacements, if any precondition is not met.
	bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
	StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
	SDValue StoredVal = StoreNode->getOperand(1);
	unsigned Opc = StoredVal->getOpcode();

	// Before we try to select anything, make sure this is memory operand size
	// and opcode we can handle. Note that this must match the code below that
	// actually lowers the opcodes.
	EVT MemVT = StoreNode->getMemoryVT();
	if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
	MemVT != MVT::i8)
	return false;
	switch (Opc) {
	default:
	return false;
	case X86ISD::INC:
	case X86ISD::DEC:
	case X86ISD::ADD:
	case X86ISD::SUB:
	case X86ISD::AND:
	case X86ISD::OR:
	case X86ISD::XOR:
	break;
	}

	// Verify the load/op/store shape and obtain the load and the chain the
	// fused node should be attached to.
	LoadSDNode *LoadNode = nullptr;
	SDValue InputChain;
	if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadNode,
	InputChain))
	return false;

	// Match the load's address into the five memory-operand components.
	SDValue Base, Scale, Index, Disp, Segment;
	if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
	Segment))
	return false;

	// Pick one of four opcodes according to the width of the memory access.
	auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
	unsigned Opc8) {
	switch (MemVT.getSimpleVT().SimpleTy) {
	case MVT::i64:
	return Opc64;
	case MVT::i32:
	return Opc32;
	case MVT::i16:
	return Opc16;
	case MVT::i8:
	return Opc8;
	default:
	llvm_unreachable("Invalid size!");
	}
	};

	MachineSDNode *Result;
	switch (Opc) {
	case X86ISD::INC:
	case X86ISD::DEC: {
	// INC/DEC take only the memory operand plus the input chain.
	unsigned NewOpc =
	Opc == X86ISD::INC
	? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
	: SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
	const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
	Result =
	CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
	break;
	}
	case X86ISD::ADD:
	case X86ISD::SUB:
	case X86ISD::AND:
	case X86ISD::OR:
	case X86ISD::XOR: {
	// Memory-destination opcode with a register source.
	auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
	switch (Opc) {
	case X86ISD::ADD:
	return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
	X86::ADD8mr);
	case X86ISD::SUB:
	return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
	X86::SUB8mr);
	case X86ISD::AND:
	return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
	X86::AND8mr);
	case X86ISD::OR:
	return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
	case X86ISD::XOR:
	return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
	X86::XOR8mr);
	default:
	llvm_unreachable("Invalid opcode!");
	}
	};
	// Memory-destination opcode with an 8-bit immediate source. The i8 slot
	// is 0 because this selector is only used below when MemVT is not i8.
	auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
	switch (Opc) {
	case X86ISD::ADD:
	return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
	case X86ISD::SUB:
	return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
	case X86ISD::AND:
	return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
	case X86ISD::OR:
	return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
	case X86ISD::XOR:
	return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
	default:
	llvm_unreachable("Invalid opcode!");
	}
	};
	// Memory-destination opcode with a full-width immediate source (a 32-bit
	// immediate for the 64-bit forms).
	auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
	switch (Opc) {
	case X86ISD::ADD:
	return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
	X86::ADD8mi);
	case X86ISD::SUB:
	return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
	X86::SUB8mi);
	case X86ISD::AND:
	return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
	X86::AND8mi);
	case X86ISD::OR:
	return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
	X86::OR8mi);
	case X86ISD::XOR:
	return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
	X86::XOR8mi);
	default:
	llvm_unreachable("Invalid opcode!");
	}
	};

	// Default to the register-source form; switched to an immediate form below
	// if the second operand is a suitable constant.
	unsigned NewOpc = SelectRegOpcode(Opc);
	SDValue Operand = StoredVal->getOperand(1);

	// See if the operand is a constant that we can fold into an immediate
	// operand.
	if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
	auto OperandV = OperandC->getAPIntValue();

	// Check if we can shrink the operand enough to fit in an immediate (or
	// fit into a smaller immediate) by negating it and switching the
	// operation.
	// This is only done when hasNoCarryFlagUses() holds for the operation.
	if ((Opc == X86ISD::ADD \|\| Opc == X86ISD::SUB) &&
	((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 &&
	(-OperandV).getMinSignedBits() <= 8) \|\|
	(MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 &&
	(-OperandV).getMinSignedBits() <= 32)) &&
	hasNoCarryFlagUses(StoredVal.getNode())) {
	OperandV = -OperandV;
	Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
	}

	// First try to fit this into an Imm8 operand. If it doesn't fit, then try
	// the larger immediate operand.
	// If neither condition holds, the register-source form chosen above is
	// kept and Operand stays the original constant node.
	if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) {
	Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
	NewOpc = SelectImm8Opcode(Opc);
	} else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() &&
	(MemVT != MVT::i64 \|\| OperandV.getMinSignedBits() <= 32)) {
	Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
	NewOpc = SelectImmOpcode(Opc);
	}
	}

	// Operands: the five address components, the source operand, the chain.
	const SDValue Ops[] = {Base, Scale, Index, Disp,
	Segment, Operand, InputChain};
	Result =
	CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
	break;
	}
	default:
	llvm_unreachable("Invalid opcode!");
	}

	// The fused node both reads and writes memory, so attach the memory
	// operands of the store and of the load.
	MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
	MemOp[0] = StoreNode->getMemOperand();
	MemOp[1] = LoadNode->getMemOperand();
	Result->setMemRefs(MemOp, MemOp + 2);

	// Redirect users of the store's result 0 to the new node's MVT::Other
	// result, and users of result 1 of the original operation to the new node's
	// i32 result (presumably the EFLAGS value discussed in the header comment;
	// confirm against the X86ISD node definitions).
	ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
	ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
	CurDAG->RemoveDeadNode(Node);
	return true;
	}

	// Try to select an AND of the form (X >> C1) & C2 as a BEXTR/BEXTRI
	// instruction. Returns true if Node was replaced (and removed), false if the
	// pattern does not apply.
	bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
	  MVT NVT = Node->getSimpleValueType(0);
	  SDLoc dl(Node);

	  SDValue Shifted = Node->getOperand(0);
	  SDValue MaskOp = Node->getOperand(1);

	  // Needs either BMI or TBM.
	  if (!(Subtarget->hasBMI() || Subtarget->hasTBM()))
	    return false;

	  // Operand 0 must be a right shift (logical or arithmetic) ...
	  const unsigned ShiftOpc = Shifted->getOpcode();
	  if (ShiftOpc != ISD::SRL && ShiftOpc != ISD::SRA)
	    return false;

	  // ... with no users besides this AND.
	  if (!Shifted->hasOneUse())
	    return false;

	  // Only supported for 32 and 64 bits.
	  const bool Is64 = NVT == MVT::i64;
	  if (NVT != MVT::i32 && !Is64)
	    return false;

	  // Both the shift amount and the AND's right-hand side must be constants.
	  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(MaskOp);
	  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Shifted->getOperand(1));
	  if (!MaskCst || !ShiftCst)
	    return false;

	  // The AND's right-hand side must be a mask.
	  const uint64_t Mask = MaskCst->getZExtValue();
	  if (!isMask_64(Mask))
	    return false;

	  const uint64_t Shift = ShiftCst->getZExtValue();
	  const uint64_t MaskSize = countPopulation(Mask);

	  // Don't interfere with something that can be handled by extracting AH.
	  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
	  if (Shift == 8 && MaskSize == 8)
	    return false;

	  // Make sure we are only using bits that were in the original value, not
	  // shifted in.
	  if (Shift + MaskSize > NVT.getSizeInBits())
	    return false;

	  // Control value: start bit in bits 0-7, field length in bits 8 and up.
	  SDValue Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);

	  unsigned ROpc, MOpc;
	  if (Subtarget->hasTBM()) {
	    // The BEXTRI forms take the control value as an immediate.
	    ROpc = Is64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
	    MOpc = Is64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
	  } else {
	    // BMI requires the immediate to be placed in a register.
	    ROpc = Is64 ? X86::BEXTR64rr : X86::BEXTR32rr;
	    MOpc = Is64 ? X86::BEXTR64rm : X86::BEXTR32rm;
	    // NOTE(review): the MOV32ri node is created with result type NVT, which
	    // is i64 on the 64-bit path before the SUBREG_TO_REG wrap below. Confirm
	    // this is intended.
	    Control = SDValue(CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, Control), 0);
	    if (Is64) {
	      Control =
	          SDValue(CurDAG->getMachineNode(
	                      TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
	                      CurDAG->getTargetConstant(0, dl, MVT::i64), Control,
	                      CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
	                  0);
	    }
	  }

	  MachineSDNode *NewNode;
	  SDValue Input = Shifted->getOperand(0);
	  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	  if (!tryFoldLoad(Node, Shifted.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3,
	                   Tmp4)) {
	    // Register form: the shifted value is used directly.
	    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, Control);
	  } else {
	    // Memory form: the load feeding the shift is folded into the instruction.
	    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control,
	                      Input.getOperand(0) };
	    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
	    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
	    // Update the chain.
	    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
	    // Record the mem-refs
	    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
	    MemOp[0] = cast<LoadSDNode>(Input)->getMemOperand();
	    NewNode->setMemRefs(MemOp, MemOp + 1);
	  }

	  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
	  CurDAG->RemoveDeadNode(Node);
	  return true;
	}

	void X86DAGToDAGISel::Select(SDNode *Node) {
	MVT NVT = Node->getSimpleValueType(0);
	unsigned Opc, MOpc;
	unsigned Opcode = Node->getOpcode();
	SDLoc dl(Node);

	DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');

	if (Node->isMachineOpcode()) {
	DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
	Node->setNodeId(-1);
	return; // Already selected.
	}

	switch (Opcode) {
	default: break;
	case ISD::BRIND: {
	if (Subtarget->isTargetNaCl())
	// NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
	// leave the instruction alone.
	break;
	if (Subtarget->isTarget64BitILP32()) {
	// Converts a 32-bit register to a 64-bit, zero-extended version of
	// it. This is needed because x86-64 can do many things, but jmp %r32
	// ain't one of them.
	const SDValue &Target = Node->getOperand(1);
	assert(Target.getSimpleValueType() == llvm::MVT::i32);
	SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
	SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
	Node->getOperand(0), ZextTarget);
	ReplaceNode(Node, Brind.getNode());
	SelectCode(ZextTarget.getNode());
	SelectCode(Brind.getNode());
	return;
	}
	break;
	}
	case X86ISD::GlobalBaseReg:
	ReplaceNode(Node, getGlobalBaseReg());
	return;

	case X86ISD::SELECT:
	case X86ISD::SHRUNKBLEND: {
	// SHRUNKBLEND selects like a regular VSELECT. Same with X86ISD::SELECT.
	SDValue VSelect = CurDAG->getNode(
	ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
	Node->getOperand(1), Node->getOperand(2));
	ReplaceNode(Node, VSelect.getNode());
	SelectCode(VSelect.getNode());
	// We already called ReplaceUses.
	return;
	}

	case ISD::AND:
	// Try to match BEXTR/BEXTRI instruction.
	if (matchBEXTRFromAnd(Node))
	return;

	LLVM_FALLTHROUGH;
	case ISD::OR:
	case ISD::XOR: {

	// For operations of the form (x << C1) op C2, check if we can use a smaller
	// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	if (N0->getOpcode() != ISD::SHL \|\| !N0->hasOneUse())
	break;

	// i8 is unshrinkable, i16 should be promoted to i32.
	if (NVT != MVT::i32 && NVT != MVT::i64)
	break;

	ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
	ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
	if (!Cst \|\| !ShlCst)
	break;

	int64_t Val = Cst->getSExtValue();
	uint64_t ShlVal = ShlCst->getZExtValue();

	// Make sure that we don't change the operation by removing bits.
	// This only matters for OR and XOR, AND is unaffected.
	uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
	if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
	break;

	unsigned ShlOp, AddOp, Op;
	MVT CstVT = NVT;

	// Check the minimum bitwidth for the new constant.
	// TODO: AND32ri is the same as AND64ri32 with zext imm.
	// TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
	// TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
	if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
	CstVT = MVT::i8;
	else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
	CstVT = MVT::i32;

	// Bail if there is no smaller encoding.
	if (NVT == CstVT)
	break;

	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i32:
	assert(CstVT == MVT::i8);
	ShlOp = X86::SHL32ri;
	AddOp = X86::ADD32rr;

	switch (Opcode) {
	default: llvm_unreachable("Impossible opcode");
	case ISD::AND: Op = X86::AND32ri8; break;
	case ISD::OR: Op = X86::OR32ri8; break;
	case ISD::XOR: Op = X86::XOR32ri8; break;
	}
	break;
	case MVT::i64:
	assert(CstVT == MVT::i8 \|\| CstVT == MVT::i32);
	ShlOp = X86::SHL64ri;
	AddOp = X86::ADD64rr;

	switch (Opcode) {
	default: llvm_unreachable("Impossible opcode");
	case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
	case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
	case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
	}
	break;
	}

	// Emit the smaller op and the shift.
	SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
	SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
	if (ShlVal == 1)
	CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
	SDValue(New, 0));
	else
	CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
	getI8Imm(ShlVal, dl));
	return;
	}
	case X86ISD::UMUL8:
	case X86ISD::SMUL8: {
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);

	SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
	N0, SDValue()).getValue(1);

	SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32);
	SDValue Ops[] = {N1, InFlag};
	SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);

	ReplaceNode(Node, CNode);
	return;
	}

	case X86ISD::UMUL: {
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	unsigned LoReg;
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	// MVT::i8 is handled by X86ISD::UMUL8.
	case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
	case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
	case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
	}

	SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
	N0, SDValue()).getValue(1);

	SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
	SDValue Ops[] = {N1, InFlag};
	SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);

	ReplaceNode(Node, CNode);
	return;
	}

	case ISD::SMUL_LOHI:
	case ISD::UMUL_LOHI: {
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	bool isSigned = Opcode == ISD::SMUL_LOHI;
	bool hasBMI2 = Subtarget->hasBMI2();
	if (!isSigned) {
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
	case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
	case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
	MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
	case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
	MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
	}
	} else {
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
	case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
	case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
	case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
	}
	}

	unsigned SrcReg, LoReg, HiReg;
	switch (Opc) {
	default: llvm_unreachable("Unknown MUL opcode!");
	case X86::IMUL8r:
	case X86::MUL8r:
	SrcReg = LoReg = X86::AL; HiReg = X86::AH;
	break;
	case X86::IMUL16r:
	case X86::MUL16r:
	SrcReg = LoReg = X86::AX; HiReg = X86::DX;
	break;
	case X86::IMUL32r:
	case X86::MUL32r:
	SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
	break;
	case X86::IMUL64r:
	case X86::MUL64r:
	SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
	break;
	case X86::MULX32rr:
	SrcReg = X86::EDX; LoReg = HiReg = 0;
	break;
	case X86::MULX64rr:
	SrcReg = X86::RDX; LoReg = HiReg = 0;
	break;
	}

	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
	// Multiply is commutative.
	if (!foldedLoad) {
	foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
	if (foldedLoad)
	std::swap(N0, N1);
	}

	SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
	N0, SDValue()).getValue(1);
	SDValue ResHi, ResLo;

	if (foldedLoad) {
	SDValue Chain;
	MachineSDNode *CNode = nullptr;
	SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
	InFlag };
	if (MOpc == X86::MULX32rm \|\| MOpc == X86::MULX64rm) {
	SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
	CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
	ResHi = SDValue(CNode, 0);
	ResLo = SDValue(CNode, 1);
	Chain = SDValue(CNode, 2);
	InFlag = SDValue(CNode, 3);
	} else {
	SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
	CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
	Chain = SDValue(CNode, 0);
	InFlag = SDValue(CNode, 1);
	}

	// Update the chain.
	ReplaceUses(N1.getValue(1), Chain);
	// Record the mem-refs
	MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
	MemOp[0] = cast<LoadSDNode>(N1)->getMemOperand();
	CNode->setMemRefs(MemOp, MemOp + 1);
	} else {
	SDValue Ops[] = { N1, InFlag };
	if (Opc == X86::MULX32rr \|\| Opc == X86::MULX64rr) {
	SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
	SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
	ResHi = SDValue(CNode, 0);
	ResLo = SDValue(CNode, 1);
	InFlag = SDValue(CNode, 2);
	} else {
	SDVTList VTs = CurDAG->getVTList(MVT::Glue);
	SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
	InFlag = SDValue(CNode, 0);
	}
	}

	// Prevent use of AH in a REX instruction by referencing AX instead.
	if (HiReg == X86::AH && Subtarget->is64Bit() &&
	!SDValue(Node, 1).use_empty()) {
	SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
	X86::AX, MVT::i16, InFlag);
	InFlag = Result.getValue(2);
	// Get the low part if needed. Don't use getCopyFromReg for aliasing
	// registers.
	if (!SDValue(Node, 0).use_empty())
	ReplaceUses(SDValue(Node, 0),
	CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));

	// Shift AX down 8 bits.
	Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
	Result,
	CurDAG->getTargetConstant(8, dl, MVT::i8)),
	0);
	// Then truncate it down to i8.
	ReplaceUses(SDValue(Node, 1),
	CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
	}
	// Copy the low half of the result, if it is needed.
	if (!SDValue(Node, 0).use_empty()) {
	if (!ResLo.getNode()) {
	assert(LoReg && "Register for low half is not defined!");
	ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
	InFlag);
	InFlag = ResLo.getValue(2);
	}
	ReplaceUses(SDValue(Node, 0), ResLo);
	DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
	}
	// Copy the high half of the result, if it is needed.
	if (!SDValue(Node, 1).use_empty()) {
	if (!ResHi.getNode()) {
	assert(HiReg && "Register for high half is not defined!");
	ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
	InFlag);
	InFlag = ResHi.getValue(2);
	}
	ReplaceUses(SDValue(Node, 1), ResHi);
	DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
	}

	CurDAG->RemoveDeadNode(Node);
	return;
	}

	case ISD::SDIVREM:
	case ISD::UDIVREM:
	case X86ISD::SDIVREM8_SEXT_HREG:
	case X86ISD::UDIVREM8_ZEXT_HREG: {
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	bool isSigned = (Opcode == ISD::SDIVREM \|\|
	Opcode == X86ISD::SDIVREM8_SEXT_HREG);
	if (!isSigned) {
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
	case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
	case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
	case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
	}
	} else {
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
	case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
	case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
	case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
	}
	}

	unsigned LoReg, HiReg, ClrReg;
	unsigned SExtOpcode;
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8:
	LoReg = X86::AL; ClrReg = HiReg = X86::AH;
	SExtOpcode = X86::CBW;
	break;
	case MVT::i16:
	LoReg = X86::AX; HiReg = X86::DX;
	ClrReg = X86::DX;
	SExtOpcode = X86::CWD;
	break;
	case MVT::i32:
	LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
	SExtOpcode = X86::CDQ;
	break;
	case MVT::i64:
	LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
	SExtOpcode = X86::CQO;
	break;
	}

	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
	bool signBitIsZero = CurDAG->SignBitIsZero(N0);

	SDValue InFlag;
	if (NVT == MVT::i8 && (!isSigned \|\| signBitIsZero)) {
	// Special case for div8, just use a move with zero extension to AX to
	// clear the upper 8 bits (AH).
	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
	if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
	SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
	Move =
	SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
	MVT::Other, Ops), 0);
	Chain = Move.getValue(1);
	ReplaceUses(N0.getValue(1), Chain);
	} else {
	Move =
	SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
	Chain = CurDAG->getEntryNode();
	}
	Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
	InFlag = Chain.getValue(1);
	} else {
	InFlag =
	CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
	LoReg, N0, SDValue()).getValue(1);
	if (isSigned && !signBitIsZero) {
	// Sign extend the low part into the high part.
	InFlag =
	SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
	} else {
	// Zero out the high part, effectively zero extending the input.
	SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
	switch (NVT.SimpleTy) {
	case MVT::i16:
	ClrNode =
	SDValue(CurDAG->getMachineNode(
	TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
	CurDAG->getTargetConstant(X86::sub_16bit, dl,
	MVT::i32)),
	0);
	break;
	case MVT::i32:
	break;
	case MVT::i64:
	ClrNode =
	SDValue(CurDAG->getMachineNode(
	TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
	CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
	CurDAG->getTargetConstant(X86::sub_32bit, dl,
	MVT::i32)),
	0);
	break;
	default:
	llvm_unreachable("Unexpected division source");
	}

	InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
	ClrNode, InFlag).getValue(1);
	}
	}

	if (foldedLoad) {
	SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
	InFlag };
	MachineSDNode *CNode =
	CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
	InFlag = SDValue(CNode, 1);
	// Update the chain.
	ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
	// Record the mem-refs
	MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
	MemOp[0] = cast<LoadSDNode>(N1)->getMemOperand();
	CNode->setMemRefs(MemOp, MemOp + 1);
	} else {
	InFlag =
	SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
	}

	// Prevent use of AH in a REX instruction by explicitly copying it to
	// an ABCD_L register.
	//
	// The current assumption of the register allocator is that isel
	// won't generate explicit references to the GR8_ABCD_H registers. If
	// the allocator and/or the backend get enhanced to be more robust in
	// that regard, this can be, and should be, removed.
	if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
	SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
	unsigned AHExtOpcode =
	isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;

	SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
	MVT::Glue, AHCopy, InFlag);
	SDValue Result(RNode, 0);
	InFlag = SDValue(RNode, 1);

	if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG \|\|
	Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
	assert(Node->getValueType(1) == MVT::i32 && "Unexpected result type!");
	} else {
	Result =
	CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
	}
	ReplaceUses(SDValue(Node, 1), Result);
	DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
	}
	// Copy the division (low) result, if it is needed.
	if (!SDValue(Node, 0).use_empty()) {
	SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
	LoReg, NVT, InFlag);
	InFlag = Result.getValue(2);
	ReplaceUses(SDValue(Node, 0), Result);
	DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
	}
	// Copy the remainder (high) result, if it is needed.
	if (!SDValue(Node, 1).use_empty()) {
	SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
	HiReg, NVT, InFlag);
	InFlag = Result.getValue(2);
	ReplaceUses(SDValue(Node, 1), Result);
	DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
	}
	CurDAG->RemoveDeadNode(Node);
	return;
	}

	case X86ISD::CMP:
	case X86ISD::SUB: {
	// Sometimes a SUB is used to perform comparison.
	if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
	// This node is not a CMP.
	break;
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
	hasNoSignedComparisonUses(Node))
	N0 = N0.getOperand(0);

	// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
	// use a smaller encoding.
	// Look past the truncate if CMP is the only use of it.
	if ((N0.getOpcode() == ISD::AND \|\|
	(N0.getResNo() == 0 && N0.getOpcode() == X86ISD::AND)) &&
	N0.getNode()->hasOneUse() &&
	N0.getValueType() != MVT::i8 &&
	X86::isZeroNode(N1)) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	if (!C) break;
	uint64_t Mask = C->getZExtValue();

	// For example, convert "testl %eax, $8" to "testb %al, $8"
	if (isUInt<8>(Mask) &&
	(!(Mask & 0x80) \|\| hasNoSignedComparisonUses(Node))) {
	SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8);
	SDValue Reg = N0.getOperand(0);

	// Extract the l-register.
	SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
	MVT::i8, Reg);

	// Emit a testb.
	SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
	Subreg, Imm);
	// Replace SUB\|CMP with TEST, since SUB has two outputs while TEST has
	// one, do not call ReplaceAllUsesWith.
	ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
	SDValue(NewNode, 0));
	CurDAG->RemoveDeadNode(Node);
	return;
	}

	// For example, "testl %eax, $2048" to "testb %ah, $8".
	if (isShiftedUInt<8, 8>(Mask) &&
	(!(Mask & 0x8000) \|\| hasNoSignedComparisonUses(Node))) {
	// Shift the immediate right by 8 bits.
	SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8);
	SDValue Reg = N0.getOperand(0);

	// Extract the h-register.
	SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
	MVT::i8, Reg);

	// Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
	// target GR8_NOREX registers, so make sure the register class is
	// forced.
	SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
	MVT::i32, Subreg, ShiftedImm);
	// Replace SUB\|CMP with TEST, since SUB has two outputs while TEST has
	// one, do not call ReplaceAllUsesWith.
	ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
	SDValue(NewNode, 0));
	CurDAG->RemoveDeadNode(Node);
	return;
	}

	// For example, "testl %eax, $32776" to "testw %ax, $32776".
	// NOTE: We only want to form TESTW instructions if optimizing for
	// min size. Otherwise we only save one byte and possibly get a length
	// changing prefix penalty in the decoders.
	if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 &&
	(!(Mask & 0x8000) \|\| hasNoSignedComparisonUses(Node))) {
	SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16);
	SDValue Reg = N0.getOperand(0);

	// Extract the 16-bit subregister.
	SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
	MVT::i16, Reg);

	// Emit a testw.
	SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
	Subreg, Imm);
	// Replace SUB\|CMP with TEST, since SUB has two outputs while TEST has
	// one, do not call ReplaceAllUsesWith.
	ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
	SDValue(NewNode, 0));
	CurDAG->RemoveDeadNode(Node);
	return;
	}

	// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
	if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 &&
	(!(Mask & 0x80000000) \|\| hasNoSignedComparisonUses(Node))) {
	SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
	SDValue Reg = N0.getOperand(0);

	// Extract the 32-bit subregister.
	SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
	MVT::i32, Reg);

	// Emit a testl.
	SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
	Subreg, Imm);
	// Replace SUB\|CMP with TEST, since SUB has two outputs while TEST has
	// one, do not call ReplaceAllUsesWith.
	ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
	SDValue(NewNode, 0));
	CurDAG->RemoveDeadNode(Node);
	return;
	}
	}
	break;
	}
	case ISD::STORE:
	if (foldLoadStoreIntoMemOperand(Node))
	return;
	break;
	}

	SelectCode(Node);
	}

	bool X86DAGToDAGISel::
	SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
	std::vector<SDValue> &OutOps) {
	SDValue Op0, Op1, Op2, Op3, Op4;
	switch (ConstraintID) {
	default:
	llvm_unreachable("Unexpected asm memory constraint");
	case InlineAsm::Constraint_i:
	// FIXME: It seems strange that 'i' is needed here since it's supposed to
	// be an immediate and not a memory constraint.
	LLVM_FALLTHROUGH;
	case InlineAsm::Constraint_o: // offsetable ??
	case InlineAsm::Constraint_v: // not offsetable ??
	case InlineAsm::Constraint_m: // memory
	case InlineAsm::Constraint_X:
	if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
	return true;
	break;
	}

	OutOps.push_back(Op0);
	OutOps.push_back(Op1);
	OutOps.push_back(Op2);
	OutOps.push_back(Op3);
	OutOps.push_back(Op4);
	return false;
	}

	/// Factory for the X86 DAG-to-DAG instruction selector, the pass that turns
	/// a legalized DAG into an X86-specific DAG ready for instruction scheduling.
	///
	/// \param TM        target machine forwarded to the selector's constructor.
	/// \param OptLevel  optimization level forwarded to the selector's constructor.
	/// \returns the newly allocated X86DAGToDAGISel as a FunctionPass pointer.
	FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
	                                     CodeGenOpt::Level OptLevel) {
	  FunctionPass *ISelPass = new X86DAGToDAGISel(TM, OptLevel);
	  return ISelPass;
	}
	Index: head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 328817)
	@@ -1,38737 +1,38860 @@
	//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that X86 uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "X86ISelLowering.h"
	#include "Utils/X86ShuffleDecode.h"
	#include "X86CallingConv.h"
	#include "X86FrameLowering.h"
	#include "X86InstrBuilder.h"
	#include "X86IntrinsicsInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86ShuffleDecodeConstantPool.h"
	#include "X86TargetMachine.h"
	#include "X86TargetObjectFile.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/IntrinsicLowering.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cctype>
	#include <numeric>
	using namespace llvm;

	// Debug-type tag for this file. NOTE(review): presumably consumed by the
	// DEBUG and STATISTIC macros used below and elsewhere in the file -- confirm.
	#define DEBUG_TYPE "x86-isel"

	// Statistic counter named NumTailCalls, described as "Number of tail calls".
	STATISTIC(NumTailCalls, "Number of tail calls");

	// Hidden boolean command-line option
	// "x86-experimental-vector-widening-legalization"; off (false) by default.
	// Per its description it selects vector type legalization through widening
	// rather than promotion.
	static cl::opt<bool> ExperimentalVectorWideningLegalization(
	"x86-experimental-vector-widening-legalization", cl::init(false),
	cl::desc("Enable an experimental vector type legalization through widening "
	"rather than promotion."),
	cl::Hidden);

	// Hidden integer command-line option "x86-experimental-pref-loop-alignment";
	// defaults to 4. Per its description the value is the number of low bits of
	// the loop header PC that will be zero.
	static cl::opt<int> ExperimentalPrefLoopAlignment(
	"x86-experimental-pref-loop-alignment", cl::init(4),
	cl::desc("Sets the preferable loop alignment for experiments "
	"(the last x86-experimental-pref-loop-alignment bits"
	" of the loop header PC will be 0)."),
	cl::Hidden);

	// Hidden boolean command-line option "mul-constant-optimization"; on (true)
	// by default. Per its description it allows 'mul x, Const' to be replaced
	// with instructions such as SHIFT and LEA.
	static cl::opt<bool> MulConstantOptimization(
	"mul-constant-optimization", cl::init(true),
	cl::desc("Replace 'mul x, Const' with more effective instructions like "
	"SHIFT, LEA, etc."),
	cl::Hidden);

	/// Emit an "unsupported" diagnostic for the function currently being
	/// compiled, e.g. when the user tries to return a double on x86_64 without
	/// SSE2 enabled.
	///
	/// Unlike report_fatal_error this is not fatal, so callers are expected to
	/// recover and continue without crashing.
	///
	/// \param DAG  selection DAG supplying the machine function and the context.
	/// \param dl   location whose debug loc is attached to the diagnostic.
	/// \param Msg  message text passed to the diagnostic.
	static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
	                             const char *Msg) {
	  auto *Context = DAG.getContext();
	  // The diagnostic is built in place as the argument of diagnose(), so it
	  // only lives for the duration of that call.
	  Context->diagnose(DiagnosticInfoUnsupported(
	      DAG.getMachineFunction().getFunction(), Msg, dl.getDebugLoc()));
	}

	X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
	const X86Subtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
	X86ScalarSSEf64 = Subtarget.hasSSE2();
	X86ScalarSSEf32 = Subtarget.hasSSE1();
	MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

	// Set up the TargetLowering object.

	// X86 is weird. It always uses i8 for shift amounts and setcc results.
	setBooleanContents(ZeroOrOneBooleanContent);
	// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// For 64-bit, since we have so many registers, use the ILP scheduler.
	// For 32-bit, use the register pressure specific scheduling.
	// For Atom, always use ILP scheduling.
	if (Subtarget.isAtom())
	setSchedulingPreference(Sched::ILP);
	else if (Subtarget.is64Bit())
	setSchedulingPreference(Sched::ILP);
	else
	setSchedulingPreference(Sched::RegPressure);
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

	// Bypass expensive divides and use cheaper ones.
	if (TM.getOptLevel() >= CodeGenOpt::Default) {
	if (Subtarget.hasSlowDivide32())
	addBypassSlowDiv(32, 8);
	if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
	addBypassSlowDiv(64, 32);
	}

	if (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()) {
	// Setup Windows compiler runtime calls.
	setLibcallName(RTLIB::SDIV_I64, "_alldiv");
	setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
	setLibcallName(RTLIB::SREM_I64, "_allrem");
	setLibcallName(RTLIB::UREM_I64, "_aullrem");
	setLibcallName(RTLIB::MUL_I64, "_allmul");
	setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
	}

	if (Subtarget.isTargetDarwin()) {
	// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
	setUseUnderscoreSetJmp(false);
	setUseUnderscoreLongJmp(false);
	} else if (Subtarget.isTargetWindowsGNU()) {
	// MS runtime is weird: it exports _setjmp, but longjmp!
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(false);
	} else {
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(true);
	}

	// Set up the register classes.
	addRegisterClass(MVT::i8, &X86::GR8RegClass);
	addRegisterClass(MVT::i16, &X86::GR16RegClass);
	addRegisterClass(MVT::i32, &X86::GR32RegClass);
	if (Subtarget.is64Bit())
	addRegisterClass(MVT::i64, &X86::GR64RegClass);

	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

	// We don't accept any truncstore of integer registers.
	setTruncStoreAction(MVT::i64, MVT::i32, Expand);
	setTruncStoreAction(MVT::i64, MVT::i16, Expand);
	setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i32, MVT::i16, Expand);
	setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i16, MVT::i8, Expand);

	setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	// SETOEQ and SETUNE require checking two conditions.
	setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

	// Integer absolute.
	if (Subtarget.hasCMov()) {
	setOperationAction(ISD::ABS , MVT::i16 , Custom);
	setOperationAction(ISD::ABS , MVT::i32 , Custom);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::ABS , MVT::i64 , Custom);
	}

	// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
	// operation.
	setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
	// f32/f64 are legal, f80 is custom.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	else
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	} else if (!Subtarget.useSoftFloat()) {
	// We have an algorithm for SSE2->double, and we turn this into a
	// 64-bit FILD followed by conditional FADD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	// We have an algorithm for SSE2, and we turn this into a 64-bit
	// FILD or VCVTUSI2SS/SD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	}

	// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// SSE has no i16 to fp conversion, only i32.
	if (X86ScalarSSEf32) {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
	}

	// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
	// are Legal, f80 is custom lowered.
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

	if (X86ScalarSSEf32) {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
	}

	// Handle FP_TO_UINT by promoting the destination to a larger signed
	// conversion.
	setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
	}
	} else if (!Subtarget.useSoftFloat()) {
	// Since AVX is a superset of SSE3, only check for SSE here.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
	// Expand FP_TO_UINT into a select.
	// FIXME: We would like to use a Custom expander here eventually to do
	// the optimal thing for SSE vs. the default expansion in the legalizer.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
	else
	// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
	// With SSE3 we can use fisttpll to convert to a signed i64; without
	// SSE, we're stuck with a fistpll.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	}

	// TODO: when we have SSE, these could be more efficient, by using movd/movq.
	if (!X86ScalarSSEf64) {
	setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
	setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
	// Without SSE, i64->f64 goes through memory.
	setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
	}
	} else if (!Subtarget.is64Bit())
	setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

	// Scalar integer divide and remainder are lowered to use operations that
	// produce two results, to match the available instructions. This exposes
	// the two-result form to trivial CSE, which is able to combine x/y and x%y
	// into a single instruction.
	//
	// Scalar integer multiply-high is also lowered to use two-result
	// operations, to match the available instructions. However, plain multiply
	// (low) operations are left as Legal, as there are single-result
	// instructions for this in x86. Using the two-result multiply instructions
	// when both high and low results are needed must be arranged by dagcombine.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	}

	setOperationAction(ISD::BR_JT , MVT::Other, Expand);
	setOperationAction(ISD::BRCOND , MVT::Other, Custom);
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
	MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::BR_CC, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	}
	if (Subtarget.is64Bit())
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
	setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

	setOperationAction(ISD::FREM , MVT::f32 , Expand);
	setOperationAction(ISD::FREM , MVT::f64 , Expand);
	setOperationAction(ISD::FREM , MVT::f80 , Expand);
	setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

	// Promote the i8 variants and force them on up to i32 which has a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	if (!Subtarget.hasBMI()) {
	setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
	}
	}

	if (Subtarget.hasLZCNT()) {
	// When promoting the i8 variants, force them to i32 for a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	} else {
	setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
	}
	}

	// Special handling for half-precision floating point conversions.
	// If we don't have F16C support, then lower half float conversions
	// into library calls.
	if (Subtarget.useSoftFloat() \|\| !Subtarget.hasF16C()) {
	setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
	}

	// There's never any support for operations beyond MVT::f32.
	setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
	setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

	setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f80, MVT::f16, Expand);

	if (Subtarget.hasPOPCNT()) {
	setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
	} else {
	setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
	}

	setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

	if (!Subtarget.hasMOVBE())
	setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

	// These should be promoted to a larger select which is supported.
	setOperationAction(ISD::SELECT , MVT::i1 , Promote);
	// X86 wants to expand cmov itself.
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}

	// Custom action for SELECT MMX and expand action for SELECT_CC MMX
	setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

	setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
	// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
	// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
	setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
	setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
	setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
	if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
	setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

	// Darwin ABI issue.
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::ConstantPool , VT, Custom);
	setOperationAction(ISD::JumpTable , VT, Custom);
	setOperationAction(ISD::GlobalAddress , VT, Custom);
	setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
	setOperationAction(ISD::ExternalSymbol , VT, Custom);
	setOperationAction(ISD::BlockAddress , VT, Custom);
	}

	// 64-bit shl, sra, srl (iff 32-bit x86)
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SHL_PARTS, VT, Custom);
	setOperationAction(ISD::SRA_PARTS, VT, Custom);
	setOperationAction(ISD::SRL_PARTS, VT, Custom);
	}

	if (Subtarget.hasSSEPrefetch() \|\| Subtarget.has3DNow())
	setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

	setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

	// Expand certain atomics
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
	setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
	}

	if (Subtarget.hasCmpxchg16b()) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
	}

	// FIXME - use subtarget debug flags
	if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
	!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
	TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
	setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
	}

	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

	setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
	setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// VASTART needs to be custom lowered to use the VarArgsFrameIndex
	setOperationAction(ISD::VASTART , MVT::Other, Custom);
	setOperationAction(ISD::VAEND , MVT::Other, Expand);
	bool Is64Bit = Subtarget.is64Bit();
	setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

	// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
	setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
	setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

	if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
	// f32 and f64 use SSE.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
	: &X86::FR32RegClass);
	addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
	: &X86::FR64RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	// Use ANDPD to simulate FABS.
	setOperationAction(ISD::FABS, VT, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG, VT, Custom);

	// Use ANDPD and ORPD to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}

	// Lower this to MOVMSK plus an AND.
	setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
	setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

	// Expand FP immediates into loads from the stack, except for the special
	// cases we handle.
	addLegalFPImmediate(APFloat(+0.0)); // xorpd
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	} else if (UseX87 && X86ScalarSSEf32) {
	// Use SSE for f32, x87 for f64.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, &X86::FR32RegClass);
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);

	// Use ANDPS to simulate FABS.
	setOperationAction(ISD::FABS , MVT::f32, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG , MVT::f32, Custom);

	setOperationAction(ISD::UNDEF, MVT::f64, Expand);

	// Use ANDPS and ORPS to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , MVT::f32, Expand);
	setOperationAction(ISD::FCOS , MVT::f32, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

	// Special cases we handle for FP constants.
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , MVT::f64, Expand);
	setOperationAction(ISD::FCOS , MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	} else if (UseX87) {
	// f32 and f64 in x87.
	// Set up the FP register classes.
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);
	addRegisterClass(MVT::f32, &X86::RFP32RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	setOperationAction(ISD::UNDEF, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
	addLegalFPImmediate(APFloat(+0.0f)); // FLD0
	addLegalFPImmediate(APFloat(+1.0f)); // FLD1
	addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
	}

	// We don't support FMA.
	setOperationAction(ISD::FMA, MVT::f64, Expand);
	setOperationAction(ISD::FMA, MVT::f32, Expand);

	// Long double always uses X87, except f128 in MMX.
	if (UseX87) {
	if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::f128, &X86::FR128RegClass);
	ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
	setOperationAction(ISD::FABS , MVT::f128, Custom);
	setOperationAction(ISD::FNEG , MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
	}

	addRegisterClass(MVT::f80, &X86::RFP80RegClass);
	setOperationAction(ISD::UNDEF, MVT::f80, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
	{
	APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
	addLegalFPImmediate(TmpFlt); // FLD0
	TmpFlt.changeSign();
	addLegalFPImmediate(TmpFlt); // FLD0/FCHS

	bool ignored;
	APFloat TmpFlt2(+1.0);
	TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
	&ignored);
	addLegalFPImmediate(TmpFlt2); // FLD1
	TmpFlt2.changeSign();
	addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
	}

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , MVT::f80, Expand);
	setOperationAction(ISD::FCOS , MVT::f80, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

	setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
	setOperationAction(ISD::FCEIL, MVT::f80, Expand);
	setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
	setOperationAction(ISD::FRINT, MVT::f80, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
	setOperationAction(ISD::FMA, MVT::f80, Expand);
	}

	// Always use a library call for pow.
	setOperationAction(ISD::FPOW , MVT::f32 , Expand);
	setOperationAction(ISD::FPOW , MVT::f64 , Expand);
	setOperationAction(ISD::FPOW , MVT::f80 , Expand);

	setOperationAction(ISD::FLOG, MVT::f80, Expand);
	setOperationAction(ISD::FLOG2, MVT::f80, Expand);
	setOperationAction(ISD::FLOG10, MVT::f80, Expand);
	setOperationAction(ISD::FEXP, MVT::f80, Expand);
	setOperationAction(ISD::FEXP2, MVT::f80, Expand);
	setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
	setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

	// Some FP actions are always expanded for vector types.
	for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
	MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	}

	// First set operation action for all vector types to either promote
	// (for widening) or expand (for scalarization). Then we will selectively
	// turn on ones that can be effectively codegen'd.
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::FMA, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::SETCC, VT, Expand);
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	setOperationAction(ISD::FP_TO_SINT, VT, Expand);
	setOperationAction(ISD::UINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
	setOperationAction(ISD::TRUNCATE, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
	setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
	setOperationAction(ISD::ANY_EXTEND, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(InnerVT, VT, Expand);

	setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

	// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
	// types; we have to deal with them whether we ask for Expansion or not.
	// Setting Expand causes its own optimisation problems though, so leave
	// them legal.
	if (VT.getVectorElementType() == MVT::i1)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
	// split/scalarized right now.
	if (VT.getVectorElementType() == MVT::f16)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
	}
	}

	// FIXME: In order to prevent SSE instructions being expanded to MMX ones
	// with -msoft-float, disable use of MMX as well.
	if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
	// No operations on x86mmx supported, everything uses intrinsics.
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
	addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
	setOperationAction(ISD::FABS, MVT::v4f32, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
	setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
	setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
	addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
	// registers cannot be used even for integer operations.
	addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::MUL, MVT::v16i8, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);
	setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
	setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
	setOperationAction(ISD::MUL, MVT::v8i16, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
	setOperationAction(ISD::FABS, MVT::v2f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

	setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	}

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// We support custom legalizing of sext and anyext loads for specific
	// memory vector types which we can load as a scalar (or sequence of
	// scalars) and extend in-register to a legal 128-bit vector type. For sext
	// loads these must work with a single scalar load.
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
	}

	for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);

	if (VT == MVT::v2i64 && !Subtarget.is64Bit())
	continue;

	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
	}

	// Custom lower v2i64 and v2f64 selects.
	setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

	// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
	setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

	setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

	setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
	setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

	// In the customized shift lowering, the legal v4i32/v2i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
	setOperationAction(ISD::ABS, MVT::v16i8, Legal);
	setOperationAction(ISD::ABS, MVT::v8i16, Legal);
	setOperationAction(ISD::ABS, MVT::v4i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
	setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
	for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
	setOperationAction(ISD::FCEIL, RoundedTy, Legal);
	setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
	setOperationAction(ISD::FRINT, RoundedTy, Legal);
	setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
	}

	setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
	setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

	// FIXME: Do we need to handle scalar-to-vector here?
	setOperationAction(ISD::MUL, MVT::v4i32, Legal);

	// We directly match byte blends in the backend as they match the VSELECT
	// condition form.
	setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

	// SSE41 brings specific instructions for doing vector sign extend even in
	// cases where we don't have SRA.
	for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
	}

	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
	}

	// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
	}

	// i8 vectors are custom because the source register and
	// source memory operand types are not the same width.
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::ROTL, VT, Custom);

	// XOP can efficiently perform BITREVERSE with VPPERM.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
	bool HasInt256 = Subtarget.hasInt256();

	addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);

	for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
	// even though v8i16 is a legal type.
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

	setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

	// In the customized shift lowering, the legal v8i32/v4i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}

	setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

	for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
	setOperationAction(ISD::ANY_EXTEND, VT, Custom);
	}

	setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	}

	if (Subtarget.hasAnyFMA()) {
	for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
	MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::FMA, VT, Legal);
	}

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
	}

	setOperationAction(ISD::MUL, MVT::v4i64, Custom);
	setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v32i8, Custom);

	setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

	setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
	}

	if (HasInt256) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

	// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
	// when we have a 256bit-wide blend with immediate.
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

	// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
	}
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 128-bit but the source is 256-bit wide.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
	}

	// Custom lower several nodes for 256-bit types.
	for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	}

	if (HasInt256)
	setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

	// Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
	}

	if (HasInt256) {
	// Custom legalize 2x32 to get a little better code.
	setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
	setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MGATHER, VT, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
	addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
	addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
	addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

	addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
	addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
	addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

	setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);

	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
	if (Subtarget.hasVLX()) {
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
	}

	// Extends of v16i1/v8i1 to 128-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);

	for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::TRUNCATE, VT, Custom);

	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
	for (auto VT : { MVT::v1i1, MVT::v8i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
	}

	for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
	MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
	MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
	setTruncStoreAction(VT, MaskVT, Custom);
	}

	for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FMA, VT, Legal);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
	setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

	setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

	if (!Subtarget.hasVLX()) {
	// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
	// to 512-bit rather than use the AVX2 instructions so that we can use
	// k-masks.
	for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
	setOperationAction(ISD::MLOAD, VT, Custom);
	setOperationAction(ISD::MSTORE, VT, Custom);
	}
	}

	setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

	for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	}

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

	// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

	setOperationAction(ISD::MUL, MVT::v8i64, Custom);
	setOperationAction(ISD::MUL, MVT::v16i32, Legal);

	setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

	setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

	for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	// Need to promote to 64-bit even though we have 32-bit masked instructions
	// because the IR optimizers rearrange bitcasts around logic ops leaving
	// too many variations to handle if we don't promote them.
	setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
	setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
	setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);

	if (Subtarget.hasDQI()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
	}

	if (Subtarget.hasCDI()) {
	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v16i32, MVT::v8i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 256-bit but the source is 512-bit wide.
	// 128-bit was made Legal under AVX1.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

	for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::MGATHER, VT, Custom);
	setOperationAction(ISD::MSCATTER, VT, Custom);
	}
	for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
	}
	}// has AVX-512

	if (!Subtarget.useSoftFloat() &&
	(Subtarget.hasAVX512() \|\| Subtarget.hasVLX())) {
	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);

	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MSCATTER, VT, Custom);

	if (Subtarget.hasDQI()) {
	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SINT_TO_FP, VT, Legal);
	setOperationAction(ISD::UINT_TO_FP, VT, Legal);
	setOperationAction(ISD::FP_TO_SINT, VT, Legal);
	setOperationAction(ISD::FP_TO_UINT, VT, Legal);
	}
	}

	if (Subtarget.hasCDI()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
	addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
	addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

	addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
	addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

	for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
	for (auto VT : { MVT::v16i1, MVT::v32i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	// Extends from v32i1 masks to 256-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
	// Extends from v64i1 masks to 512-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);

	setOperationAction(ISD::MUL, MVT::v32i16, Legal);
	setOperationAction(ISD::MUL, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

	setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

	for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);

	setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
	}

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
	}

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v64i8, MVT::v32i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
	(Subtarget.hasAVX512() \|\| Subtarget.hasVLX())) {
	for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
	setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
	}

	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
	addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
	addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

	for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	// TODO: v8i1 concat should be legal without VLX to support concats of
	// v1i1, but we won't legalize it correctly currently without introducing
	// a v4i1 concat in the middle.
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
	for (auto VT : { MVT::v2i1, MVT::v4i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	// Extends from v2i1/v4i1 masks to 128-bit vectors.
	setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);

	setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

	setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);

	if (Subtarget.hasDQI()) {
	// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
	// v2f32 UINT_TO_FP is already custom under SSE2.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
	assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
	"Unexpected operation action!");
	// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
	setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
	}

	if (Subtarget.hasBWI()) {
	setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
	}
	}

	// We want to custom lower some of our intrinsics.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
	if (!Subtarget.is64Bit()) {
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
	}

	// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
	// handle type legalization for these operations here.
	//
	// FIXME: We really should do custom legalization for addition and
	// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
	// than generic legalization for 64-bit multiplication-with-overflow, though.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	// Add/Sub/Mul with overflow operations are custom lowered.
	setOperationAction(ISD::SADDO, VT, Custom);
	setOperationAction(ISD::UADDO, VT, Custom);
	setOperationAction(ISD::SSUBO, VT, Custom);
	setOperationAction(ISD::USUBO, VT, Custom);
	setOperationAction(ISD::SMULO, VT, Custom);
	setOperationAction(ISD::UMULO, VT, Custom);

	// Support carry in as value rather than glue.
	setOperationAction(ISD::ADDCARRY, VT, Custom);
	setOperationAction(ISD::SUBCARRY, VT, Custom);
	setOperationAction(ISD::SETCCCARRY, VT, Custom);
	}

	if (!Subtarget.is64Bit()) {
	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);
	setLibcallName(RTLIB::MUL_I128, nullptr);
	}

	// Combine sin / cos into _sincos_stret if it is available.
	if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
	getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	}

	if (Subtarget.isTargetWin64()) {
	setOperationAction(ISD::SDIV, MVT::i128, Custom);
	setOperationAction(ISD::UDIV, MVT::i128, Custom);
	setOperationAction(ISD::SREM, MVT::i128, Custom);
	setOperationAction(ISD::UREM, MVT::i128, Custom);
	setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
	setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
	}

	// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
	// is. We should promote the value to 64-bits to solve this.
	// This is what the CRT headers do - `fmodf` is an inline header
	// function casting to f64 and calling `fmod`.
	if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()))
	for (ISD::NodeType Op :
	{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
	ISD::FLOG10, ISD::FPOW, ISD::FSIN})
	if (isOperationExpand(Op, MVT::f32))
	setOperationAction(Op, MVT::f32, Promote);

	// We have target-specific dag combine patterns for the following nodes:
	setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
	setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
	setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::VSELECT);
	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::OR);
	setTargetDAGCombine(ISD::AND);
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::FADD);
	setTargetDAGCombine(ISD::FSUB);
	setTargetDAGCombine(ISD::FNEG);
	setTargetDAGCombine(ISD::FMA);
	setTargetDAGCombine(ISD::FMINNUM);
	setTargetDAGCombine(ISD::FMAXNUM);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::MLOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::MSTORE);
	setTargetDAGCombine(ISD::TRUNCATE);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
	setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);
	setTargetDAGCombine(ISD::SETCC);
	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::MSCATTER);
	setTargetDAGCombine(ISD::MGATHER);

	computeRegisterProperties(Subtarget.getRegisterInfo());

	MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
	MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
	MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
	MaxStoresPerMemmoveOptSize = 4;

	// TODO: These control memcmp expansion in CGP and could be raised higher, but
	// that needs to be benchmarked and balanced with the potential use of vector
	// load/store types (PR33329, PR33914).
	MaxLoadsPerMemcmp = 2;
	MaxLoadsPerMemcmpOptSize = 2;

	// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
	setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

	// An out-of-order CPU can speculatively execute past a predictable branch,
	// but a conditional move could be stalled by an expensive earlier operation.
	PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
	EnableExtLdPromotion = true;
	setPrefFunctionAlignment(4); // 2^4 bytes.

	verifyIntrinsicTables();
	}

	/// Returns true only when the subtarget is both MachO and 64-bit; per the
	/// original note, this has so far only been implemented for 64-bit MachO.
	bool X86TargetLowering::useLoadStackGuardNode() const {
	  if (!Subtarget.isTargetMachO())
	    return false;
	  return Subtarget.is64Bit();
	}

	bool X86TargetLowering::useStackGuardXorFP() const {
	// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
	return Subtarget.getTargetTriple().isOSMSVCRT();
	}

	/// Builds a machine node that applies the XOR64_FP (64-bit) or XOR32_FP
	/// (32-bit) pseudo opcode to \p Val and returns its first result. The node
	/// is typed with the pointer type of the DAG's data layout.
	SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
	                                               const SDLoc &DL) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Pick the opcode variant matching the subtarget's word size.
	  unsigned Opcode = X86::XOR32_FP;
	  if (Subtarget.is64Bit())
	    Opcode = X86::XOR64_FP;

	  return SDValue(DAG.getMachineNode(Opcode, DL, PtrVT, Val), 0);
	}

	/// Choose the legalization action for the vector type \p VT.
	///
	/// When ExperimentalVectorWideningLegalization is set, any vector whose
	/// element count is not 1 and whose element type is not i1 is widened.
	/// Everything else is deferred to the TargetLoweringBase implementation.
	TargetLoweringBase::LegalizeTypeAction
	X86TargetLowering::getPreferredVectorAction(EVT VT) const {
	  // Compare the element type as an EVT rather than going through
	  // getSimpleVT(): the element type of an arbitrary vector may be an
	  // extended (non-simple) type, for which getSimpleVT() asserts.
	  if (ExperimentalVectorWideningLegalization &&
	      VT.getVectorNumElements() != 1 &&
	      VT.getVectorElementType() != MVT::i1)
	    return TypeWidenVector;

	  return TargetLoweringBase::getPreferredVectorAction(VT);
	}

	/// Return the value type produced by a SETCC whose operands have type \p VT.
	///
	/// Non-vector operands compare into i8. With AVX-512, a vector compares into
	/// a vXi1 type with the same element count as \p VT when the type it
	/// legalizes to is 512 bits wide, or when VLX is available and the legalized
	/// vector has elements of at least 32 bits (any element size with BWI).
	/// All remaining vectors compare into \p VT with its elements changed to
	/// integers.
	EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
	                                          LLVMContext& Context,
	                                          EVT VT) const {
	  if (!VT.isVector())
	    return MVT::i8;

	  if (Subtarget.hasAVX512()) {
	    const unsigned NumElts = VT.getVectorNumElements();

	    // Figure out what this type will be legalized to.
	    EVT LegalVT = VT;
	    while (getTypeAction(Context, LegalVT) != TypeLegal)
	      LegalVT = getTypeToTransformTo(Context, LegalVT);

	    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
	    if (LegalVT.getSimpleVT().is512BitVector())
	      return EVT::getVectorVT(Context, MVT::i1, NumElts);

	    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
	      // If we legalized to less than a 512-bit vector, then we will use a vXi1
	      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
	      // vXi16/vXi8.
	      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
	      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
	        return EVT::getVectorVT(Context, MVT::i1, NumElts);
	    }
	  }

	  return VT.changeVectorElementTypeToInteger();
	}

	/// Helper for getByValTypeAlignment: raises \p MaxAlign to 16 when \p Ty is,
	/// or (through arrays and structs) contains, a 128-bit vector. \p MaxAlign is
	/// never lowered, and the walk stops as soon as it reaches 16.
	static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
	  // Nothing can raise the alignment any further.
	  if (MaxAlign == 16)
	    return;

	  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
	    if (VecTy->getBitWidth() == 128)
	      MaxAlign = 16;
	    return;
	  }

	  if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) {
	    // An array is as aligned as its element type.
	    unsigned ElemAlign = 0;
	    getMaxByValAlign(ArrTy->getElementType(), ElemAlign);
	    if (ElemAlign > MaxAlign)
	      MaxAlign = ElemAlign;
	    return;
	  }

	  auto *StructTy = dyn_cast<StructType>(Ty);
	  if (!StructTy)
	    return;

	  // A struct takes the largest alignment found among its fields.
	  for (auto *FieldTy : StructTy->elements()) {
	    unsigned FieldAlign = 0;
	    getMaxByValAlign(FieldTy, FieldAlign);
	    if (FieldAlign > MaxAlign)
	      MaxAlign = FieldAlign;
	    if (MaxAlign == 16)
	      return;
	  }
	}

	/// Return the desired alignment for ByVal aggregate function arguments in
	/// the caller parameter area. On 64-bit targets this is the larger of 8 and
	/// the ABI alignment of \p Ty. On 32-bit targets it is 4, raised via
	/// getMaxByValAlign when SSE1 is available (aggregates that contain SSE
	/// vectors are placed at 16-byte boundaries).
	unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
	                                                  const DataLayout &DL) const {
	  if (!Subtarget.is64Bit()) {
	    unsigned Result = 4;
	    if (Subtarget.hasSSE1())
	      getMaxByValAlign(Ty, Result);
	    return Result;
	  }

	  // Max of 8 and alignment of type.
	  const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
	  return ABIAlign > 8 ? ABIAlign : 8;
	}

	/// Returns the target specific optimal type for load and store operations
	/// as a result of memset, memcpy, and memmove lowering. If DstAlign is zero,
	/// the destination alignment can satisfy any constraint. Similarly, if
	/// SrcAlign is zero there is no need to check it against an alignment
	/// requirement, probably because the source does not need to be loaded. If
	/// 'IsMemset' is true, a memset is being expanded. If 'ZeroMemset' is true,
	/// it is a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
	/// source is constant so it does not need to be loaded.
	/// It returns MVT::Other if the type should be determined using generic
	/// target-independent logic (no path visible in this implementation does so:
	/// the fallback is i64 on 64-bit targets when Size >= 8, and i32 otherwise).
	EVT
	X86TargetLowering::getOptimalMemOpType(uint64_t Size,
	                                       unsigned DstAlign, unsigned SrcAlign,
	                                       bool IsMemset, bool ZeroMemset,
	                                       bool MemcpyStrSrc,
	                                       MachineFunction &MF) const {
	  const Function &F = MF.getFunction();
	  // Vector and FP types are only considered when the function allows
	  // implicit floating-point use.
	  if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
	    if (Size >= 16 &&
	        (!Subtarget.isUnalignedMem16Slow() ||
	         ((DstAlign == 0 || DstAlign >= 16) &&
	          (SrcAlign == 0 || SrcAlign >= 16)))) {
	      // FIXME: Check if unaligned 32-byte accesses are slow.
	      if (Size >= 32 && Subtarget.hasAVX()) {
	        // Although this isn't a well-supported type for AVX1, we'll let
	        // legalization and shuffle lowering produce the optimal codegen. If we
	        // choose an optimal type with a vector element larger than a byte,
	        // getMemsetStores() may create an intermediate splat (using an integer
	        // multiply) before we splat as a vector.
	        return MVT::v32i8;
	      }
	      if (Subtarget.hasSSE2())
	        return MVT::v16i8;
	      // TODO: Can SSE1 handle a byte vector?
	      if (Subtarget.hasSSE1())
	        return MVT::v4f32;
	      // Without SSE1 we fall through to the integer fallback below.
	    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
	               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
	      // Do not use f64 to lower memcpy if source is string constant. It's
	      // better to use i32 to avoid the loads.
	      // Also, do not use f64 to lower memset unless this is a memset of zeros.
	      // The gymnastics of splatting a byte value into an XMM register and then
	      // only using 8-byte stores (because this is a CPU with slow unaligned
	      // 16-byte accesses) makes that a loser.
	      return MVT::f64;
	    }
	  }
	  // This is a compromise. If we reach here, unaligned accesses may be slow on
	  // this target. However, creating smaller, aligned accesses could be even
	  // slower and would certainly be a lot more code.
	  if (Subtarget.is64Bit() && Size >= 8)
	    return MVT::i64;
	  return MVT::i32;
	}

	/// Reports whether \p VT is safe to use as a memory-op type: f32 and f64
	/// follow the X86ScalarSSEf32 / X86ScalarSSEf64 flags respectively, and
	/// every other type is reported as safe.
	bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
	  switch (VT.SimpleTy) {
	  case MVT::f32:
	    return X86ScalarSSEf32;
	  case MVT::f64:
	    return X86ScalarSSEf64;
	  default:
	    return true;
	  }
	}

	/// Misaligned accesses of any size are always allowed, so this always
	/// returns true. When \p Fast is non-null it is additionally set to whether
	/// an access of \p VT's size is expected to be fast; the two unnamed
	/// parameters are ignored.
	bool
	X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
	                                                  unsigned,
	                                                  unsigned,
	                                                  bool *Fast) const {
	  // The caller does not care about speed: nothing more to compute.
	  if (!Fast)
	    return true;

	  const auto SizeInBits = VT.getSizeInBits();
	  if (SizeInBits == 128) {
	    *Fast = !Subtarget.isUnalignedMem16Slow();
	  } else if (SizeInBits == 256) {
	    *Fast = !Subtarget.isUnalignedMem32Slow();
	  } else {
	    // 8-byte and under are always assumed to be fast.
	    // TODO: What about AVX-512 (512-bit) accesses?
	    *Fast = true;
	  }
	  return true;
	}

	/// Return the entry encoding for a jump table in the current function. The
	/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
	unsigned X86TargetLowering::getJumpTableEncoding() const {
	  // Outside GOT pic mode, use the normal jump table encoding heuristics.
	  if (!isPositionIndependent() || !Subtarget.isPICStyleGOT())
	    return TargetLowering::getJumpTableEncoding();

	  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
	  // symbol.
	  return MachineJumpTableInfo::EK_Custom32;
	}

	/// Forwards the subtarget's soft-float setting.
	bool X86TargetLowering::useSoftFloat() const {
	  const bool SoftFloat = Subtarget.useSoftFloat();
	  return SoftFloat;
	}

	/// Mark leading integer/pointer libcall arguments as passed in registers.
	///
	/// Only applies on 32-bit targets for the C and X86_StdCall calling
	/// conventions. The register budget is the module's
	/// getNumberRegisterParameters() value (0 when the function has no parent
	/// module). Each pointer or integer argument of at most 8 bytes consumes one
	/// register (two when larger than 4 bytes) and gets IsInReg set; marking
	/// stops at the first such argument that no longer fits in the budget.
	void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
	                                              ArgListTy &Args) const {

	  // Only relabel X86-32 for C / Stdcall CCs.
	  if (Subtarget.is64Bit())
	    return;
	  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
	    return;
	  unsigned ParamRegs = 0;
	  if (auto *M = MF->getFunction().getParent())
	    ParamRegs = M->getNumberRegisterParameters();

	  const DataLayout &DL = MF->getDataLayout();

	  // Mark the first N int arguments as having reg
	  for (auto &Arg : Args) {
	    Type *T = Arg.Ty;
	    if (!T->isPointerTy() && !T->isIntegerTy())
	      continue;

	    // Arguments wider than 8 bytes are skipped without consuming registers.
	    const auto AllocSize = DL.getTypeAllocSize(T);
	    if (AllocSize > 8)
	      continue;

	    const unsigned NumRegs = AllocSize > 4 ? 2 : 1;
	    if (ParamRegs < NumRegs)
	      return;
	    ParamRegs -= NumRegs;
	    Arg.IsInReg = true;
	  }
	}

	/// Builds the expression for one custom jump table entry: a @GOTOFF
	/// reference to the symbol of \p MBB. Asserts that the function is being
	/// compiled position-independent with the GOT PIC style; \p MJTI and \p uid
	/// are not used.
	const MCExpr *
	X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
	                                             const MachineBasicBlock *MBB,
	                                             unsigned uid,MCContext &Ctx) const{
	  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
	  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
	  // entries.
	  const auto *BlockSym = MBB->getSymbol();
	  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
	}

	/// Returns relocation base for the given PIC jumptable: \p Table itself on
	/// 64-bit targets, otherwise a GlobalBaseReg node of pointer type.
	SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
	                                                    SelectionDAG &DAG) const {
	  if (Subtarget.is64Bit())
	    return Table;

	  // This doesn't have SDLoc associated with it, but is not really the
	  // same as a Register.
	  const auto PtrVT = getPointerTy(DAG.getDataLayout());
	  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	}

	/// This returns the relocation base for the given PIC jumptable,
	/// the same as getPICJumpTableRelocBase, but as an MCExpr.
	const MCExpr *X86TargetLowering::
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
	                             MCContext &Ctx) const {
	  // Without RIP-relative PIC, the reference is relative to the PIC base.
	  if (!Subtarget.isPICStyleRIPRel())
	    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

	  // X86-64 uses RIP relative addressing based on the jump table label.
	  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
	}

	std::pair<const TargetRegisterClass *, uint8_t>
	X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
	MVT VT) const {
	const TargetRegisterClass *RRC = nullptr;
	uint8_t Cost = 1;
	switch (VT.SimpleTy) {
	default:
	return TargetLowering::findRepresentativeClass(TRI, VT);
	case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
	RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
	break;
	case MVT::x86mmx:
	RRC = &X86::VR64RegClass;
	break;
	case MVT::f32: case MVT::f64:
	case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
	case MVT::v4f32: case MVT::v2f64:
	case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
	case MVT::v8f32: case MVT::v4f64:
	case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
	case MVT::v16f32: case MVT::v8f64:
	RRC = &X86::VR128XRegClass;
	break;
	}
	return std::make_pair(RRC, Cost);
	}

	/// Returns the address space number used for the segment-relative slots
	/// built in this file: 257 for 64-bit targets outside the Kernel code
	/// model, and 256 for 32-bit targets and for the 64-bit Kernel code model.
	/// NOTE(review): going by the %fs/%gs comments at the call sites, 257 is
	/// presumably %fs and 256 %gs - confirm against the X86 address-space
	/// definitions.
	unsigned X86TargetLowering::getAddressSpace() const {
	  const bool Use256 =
	      !Subtarget.is64Bit() ||
	      getTargetMachine().getCodeModel() == CodeModel::Kernel;
	  return Use256 ? 256 : 257;
	}

	/// Returns true when \p TargetTriple names an OS for which this file reads
	/// the stack guard from a segment-relative slot: glibc, Fuchsia, or Android
	/// unless isAndroidVersionLT(17) holds.
	static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
	  if (TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia())
	    return true;
	  return TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17);
	}

	/// Builds the constant expression inttoptr(i32 \p Offset) typed as a pointer,
	/// in address space \p AddressSpace, to an i8* slot.
	static Constant* SegmentOffset(IRBuilder<> &IRB,
	                               unsigned Offset, unsigned AddressSpace) {
	  auto &Ctx = IRB.getContext();
	  auto *OffsetVal = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
	  auto *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
	  return ConstantExpr::getIntToPtr(OffsetVal, SlotPtrTy);
	}

	/// Returns the IR value addressing the stack guard. Targets accepted by
	/// hasStackGuardSlotTLS get a fixed segment-relative slot; everything else
	/// uses the TargetLowering default.
	Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
	    return TargetLowering::getIRStackGuard(IRB);

	  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
	  // tcbhead_t; use it instead of the usual global variable (see
	  // sysdeps/{i386,x86_64}/nptl/tls.h)
	  //
	  // Fuchsia: <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET as 0x10.
	  // Others: %fs:0x28, unless we're using a Kernel code model, in which case
	  // it's %gs:0x28. gs:0x14 on i386.
	  const unsigned SlotOffset =
	      Subtarget.isTargetFuchsia() ? 0x10
	                                  : (Subtarget.is64Bit() ? 0x28 : 0x14);
	  return SegmentOffset(IRB, SlotOffset, getAddressSpace());
	}

	/// Inserts into \p M the declarations the stack protector needs. For MSVC
	/// CRT targets these are the __security_cookie global and the
	/// __security_check_cookie function; targets with a dedicated stack guard
	/// slot need nothing; all others use the TargetLowering default.
	void X86TargetLowering::insertSSPDeclarations(Module &M) const {
	  const Triple &TT = Subtarget.getTargetTriple();

	  // MSVC CRT provides functionalities for stack protection.
	  if (TT.isOSMSVCRT()) {
	    auto &Ctx = M.getContext();

	    // MSVC CRT has a global variable holding security cookie.
	    M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

	    // MSVC CRT has a function to validate security cookie.
	    auto *CheckCookieFn = cast<Function>(
	        M.getOrInsertFunction("__security_check_cookie",
	                              Type::getVoidTy(Ctx),
	                              Type::getInt8PtrTy(Ctx)));
	    // The checker is a fastcall function whose argument is marked InReg.
	    CheckCookieFn->setCallingConv(CallingConv::X86_FastCall);
	    CheckCookieFn->addAttribute(1, Attribute::AttrKind::InReg);
	    return;
	  }

	  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
	  if (!hasStackGuardSlotTLS(TT))
	    TargetLowering::insertSSPDeclarations(M);
	}

	/// Returns the stack guard value used by SelectionDAG lowering: the
	/// "__security_cookie" global on MSVCRT targets, otherwise whatever the generic
	/// TargetLowering implementation returns.
	Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
	  if (!Subtarget.getTargetTriple().isOSMSVCRT())
	    return TargetLowering::getSDagStackGuard(M);

	  // MSVC CRT has a global variable holding security cookie.
	  return M.getGlobalVariable("__security_cookie");
	}

	/// Returns the function used to validate the stack guard: the
	/// "__security_check_cookie" function on MSVCRT targets, otherwise whatever the
	/// generic TargetLowering implementation returns.
	Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  if (!Subtarget.getTargetTriple().isOSMSVCRT())
	    return TargetLowering::getSSPStackGuardCheck(M);

	  // MSVC CRT has a function to validate security cookie.
	  return M.getFunction("__security_check_cookie");
	}

	/// Returns the location of the SafeStack pointer. Contiki uses the default
	/// location, Android and Fuchsia use a fixed segment-relative slot, and every
	/// other target defers to the generic TargetLowering implementation.
	Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  if (Subtarget.getTargetTriple().isOSContiki())
	    return getDefaultSafeStackPointerLocation(IRB, false);

	  const bool OnAndroid = Subtarget.isTargetAndroid();
	  if (!OnAndroid && !Subtarget.isTargetFuchsia())
	    return TargetLowering::getSafeStackPointerLocation(IRB);

	  // Fuchsia: <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
	  unsigned SlotOffset = 0x18;
	  if (OnAndroid) {
	    // Android provides a fixed TLS slot for the SafeStack pointer. See the
	    // definition of TLS_SLOT_SAFESTACK in
	    // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	    // %fs:0x48, unless we're using a Kernel code model, in which case it's
	    // %gs:0x48. %gs:0x24 on i386.
	    SlotOffset = Subtarget.is64Bit() ? 0x48 : 0x24;
	  }
	  return SegmentOffset(IRB, SlotOffset, getAddressSpace());
	}

	/// Reports whether a cast between two distinct address spaces is a no-op,
	/// which is the case exactly when both of them are numbered below 256.
	bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
	                                            unsigned DestAS) const {
	  assert(SrcAS != DestAS && "Expected different address spaces!");

	  // A cast touching an address space numbered 256 or above is never a no-op.
	  return !(SrcAS >= 256 || DestAS >= 256);
	}

	//===----------------------------------------------------------------------===//
	// Return Value Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	#include "X86GenCallingConv.inc"

	/// Returns the result of checking the outgoing return values \p Outs against
	/// the RetCC_X86 return convention for \p CallConv.
	bool X86TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  // Scratch list handed to CCState; the assignments themselves are discarded.
	  SmallVector<CCValAssign, 16> ReturnLocs;
	  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
	  const bool AllAssignable = ReturnState.CheckReturn(Outs, RetCC_X86);
	  return AllAssignable;
	}

	/// Returns a zero-terminated register list whose only entry is X86::R11. The
	/// calling convention argument is ignored.
	const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
	  static const MCPhysReg ZeroTerminatedRegs[] = {X86::R11, 0};
	  return &ZeroTerminatedRegs[0];
	}

	/// Converts a mask value (v*i1) into the scalar type \p ValLoc of the register
	/// it is assigned to.
	/// \returns the DAG node holding the converted value.
	static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
	                               const SDLoc &Dl, SelectionDAG &DAG) {
	  const EVT MaskVT = ValArg.getValueType();

	  // A single-bit mask is simply its element 0.
	  if (MaskVT == MVT::v1i1)
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
	                       DAG.getIntPtrConstant(0, Dl));

	  const bool LocIsI32 = ValLoc == MVT::i32;
	  const bool IsV8Case =
	      MaskVT == MVT::v8i1 && (ValLoc == MVT::i8 || LocIsI32);
	  const bool IsV16Case =
	      MaskVT == MVT::v16i1 && (ValLoc == MVT::i16 || LocIsI32);

	  if (IsV8Case || IsV16Case) {
	    // Up to two stages:
	    //   bitcast:   v8i1 -> i8  / v16i1 -> i16
	    //   anyextend: i8 -> i32   / i16 -> i32 (only for an i32 location)
	    const EVT ScalarVT = MaskVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
	    SDValue Scalar = DAG.getBitcast(ScalarVT, ValArg);
	    if (!LocIsI32)
	      return Scalar;
	    return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, Scalar);
	  }

	  const bool IsV32Case = MaskVT == MVT::v32i1 && LocIsI32;
	  const bool IsV64Case = MaskVT == MVT::v64i1 && ValLoc == MVT::i64;
	  if (IsV32Case || IsV64Case) {
	    // A single bitcast suffices: v32i1 -> i32 / v64i1 -> i64
	    return DAG.getBitcast(ValLoc, ValArg);
	  }

	  // Every remaining combination is sign-extended to the location type.
	  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
	}

	/// Breaks v64i1 value into two registers and adds the new node to the DAG
	static void Passv64i1ArgInRegs(
	const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
	SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
	CCValAssign &NextVA, const X86Subtarget &Subtarget) {
	assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
	assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	"The value should reside in two registers");

	// Before splitting the value we cast it to i64
	Arg = DAG.getBitcast(MVT::i64, Arg);

	// Splitting the value into two i32 types
	SDValue Lo, Hi;
	Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(0, Dl, MVT::i32));
	Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(1, Dl, MVT::i32));

	// Attach the two i32 types into corresponding registers
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
	RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
	}

	/// Lowers a function return into an X86ISD::RET_FLAG node (X86ISD::IRET for
	/// the X86_INTR calling convention).
	///
	/// Each outgoing value in \p OutVals is assigned a location by RetCC_X86,
	/// converted to its location type, and copied into its return register.
	/// The operands of the resulting node are, in order: the chain, the number of
	/// bytes to pop on return, the returned registers / FP-stack values, the
	/// sret register copy (if any), any callee-saved registers handled via copy,
	/// and finally the glue (if any copy was emitted).
	SDValue
	X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &dl, SelectionDAG &DAG) const {
	MachineFunction &MF = DAG.getMachineFunction();
	X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	// In some cases we need to disable registers from the default CSR list.
	// For example, when they are used for argument passing.
	bool ShouldDisableCalleeSavedRegister =
	CallConv == CallingConv::X86_RegCall \|\|
	MF.getFunction().hasFnAttribute("no_caller_saved_registers");

	// Interrupt handlers cannot return a value; this is a fatal error.
	if (CallConv == CallingConv::X86_INTR && !Outs.empty())
	report_fatal_error("X86 interrupts may not return any value");

	// Assign a location to every returned value.
	SmallVector<CCValAssign, 16> RVLocs;
	CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	CCInfo.AnalyzeReturn(Outs, RetCC_X86);

	SDValue Flag;
	SmallVector<SDValue, 6> RetOps;
	RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
	// Operand #1 = Bytes To Pop
	RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
	MVT::i32));

	// Copy the result values into the output registers.
	// I indexes RVLocs and OutsIndex indexes OutVals; I is advanced an extra
	// step below when one value occupies two locations.
	for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
	++I, ++OutsIndex) {
	CCValAssign &VA = RVLocs[I];
	assert(VA.isRegLoc() && "Can only return in registers!");

	// Add the register to the CalleeSaveDisableRegs list.
	if (ShouldDisableCalleeSavedRegister)
	MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

	SDValue ValToCopy = OutVals[OutsIndex];
	EVT ValVT = ValToCopy.getValueType();

	// Promote values to the appropriate types.
	if (VA.getLocInfo() == CCValAssign::SExt)
	ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
	else if (VA.getLocInfo() == CCValAssign::ZExt)
	ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
	else if (VA.getLocInfo() == CCValAssign::AExt) {
	// Vectors of i1 (masks) get their own lowering instead of ANY_EXTEND.
	if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
	ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
	else
	ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
	}
	else if (VA.getLocInfo() == CCValAssign::BCvt)
	ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

	assert(VA.getLocInfo() != CCValAssign::FPExt &&
	"Unexpected FP-extend for return value.");

	// If this is x86-64, and we disabled SSE, we can't return FP values,
	// or SSE or MMX vectors.
	if ((ValVT == MVT::f32 \|\| ValVT == MVT::f64 \|\|
	VA.getLocReg() == X86::XMM0 \|\| VA.getLocReg() == X86::XMM1) &&
	(Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
	errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	} else if (ValVT == MVT::f64 &&
	(Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
	// Likewise we can't return F64 values with SSE1 only. gcc does so, but
	// llvm-gcc has never done it right and no one has noticed, so this
	// should be OK for now.
	errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
	VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	}

	// Returns in ST0/ST1 are handled specially: these are pushed as operands to
	// the RET instruction and handled by the FP Stackifier.
	if (VA.getLocReg() == X86::FP0 \|\|
	VA.getLocReg() == X86::FP1) {
	// If this is a copy from an xmm register to ST(0), use an FPExtend to
	// change the value to the FP stack register class.
	if (isScalarFPTypeInSSEReg(VA.getValVT()))
	ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
	RetOps.push_back(ValToCopy);
	// Don't emit a copytoreg.
	continue;
	}

	// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
	// which is returned in RAX / RDX.
	if (Subtarget.is64Bit()) {
	if (ValVT == MVT::x86mmx) {
	if (VA.getLocReg() == X86::XMM0 \|\| VA.getLocReg() == X86::XMM1) {
	ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
	ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	ValToCopy);
	// If we don't have SSE2 available, convert to v4f32 so the generated
	// register is legal.
	if (!Subtarget.hasSSE2())
	ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
	}
	}
	}

	// (register, value) pairs to copy for this return value: one pair normally,
	// two when a v64i1 value is split across two registers.
	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");

	// Consumes the next location as well, hence the pre-increment of I.
	Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
	Subtarget);

	assert(2 == RegsToPass.size() &&
	"Expecting two registers after Pass64BitArgInRegs");

	// Add the second register to the CalleeSaveDisableRegs list.
	if (ShouldDisableCalleeSavedRegister)
	MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
	} else {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
	}

	// Add nodes to the DAG and add the values into the RetOps list
	for (auto &Reg : RegsToPass) {
	Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
	}
	}

	// Swift calling convention does not require we copy the sret argument
	// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

	// All x86 ABIs require that for returning structs by value we copy
	// the sret argument into %rax/%eax (depending on ABI) for the return.
	// We saved the argument into a virtual register in the entry block,
	// so now we copy the value out and into %rax/%eax.
	//
	// Checking Function.hasStructRetAttr() here is insufficient because the IR
	// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
	// false, then an sret argument may be implicitly inserted in the SelDAG. In
	// either case FuncInfo->setSRetReturnReg() will have been called.
	if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
	// When we have both sret and another return value, we should use the
	// original Chain stored in RetOps[0], instead of the current Chain updated
	// in the above loop. If we only have sret, RetOps[0] equals to Chain.

	// For the case of sret and another return value, we have
	// Chain_0 at the function entry
	// Chain_1 = getCopyToReg(Chain_0) in the above loop
	// If we use Chain_1 in getCopyFromReg, we will have
	// Val = getCopyFromReg(Chain_1)
	// Chain_2 = getCopyToReg(Chain_1, Val) from below

	// getCopyToReg(Chain_0) will be glued together with
	// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
	// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
	// Data dependency from Unit B to Unit A due to usage of Val in
	// getCopyToReg(Chain_1, Val)
	// Chain dependency from Unit A to Unit B

	// So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
	SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
	getPointerTy(MF.getDataLayout()));

	// RAX is used only for 64-bit targets that are not ILP32; EAX otherwise.
	unsigned RetValReg
	= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
	X86::RAX : X86::EAX;
	Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
	Flag = Chain.getValue(1);

	// RAX/EAX now acts like a return value.
	RetOps.push_back(
	DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

	// Add the returned register to the CalleeSaveDisableRegs list.
	if (ShouldDisableCalleeSavedRegister)
	MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
	}

	// Append the callee-saved registers that are handled via copy. The list is
	// zero-terminated, and only GR64 registers are expected in it.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	const MCPhysReg *I =
	TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
	if (I) {
	for (; *I; ++I) {
	if (X86::GR64RegClass.contains(*I))
	RetOps.push_back(DAG.getRegister(*I, MVT::i64));
	else
	llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	}
	}

	RetOps[0] = Chain; // Update chain.

	// Add the flag if we have it.
	if (Flag.getNode())
	RetOps.push_back(Flag);

	// Interrupt handlers return with IRET; everything else with RET_FLAG.
	X86ISD::NodeType opcode = X86ISD::RET_FLAG;
	if (CallConv == CallingConv::X86_INTR)
	opcode = X86ISD::IRET;
	return DAG.getNode(opcode, dl, MVT::Other, RetOps);
	}

	/// Determines whether the single result of \p N is consumed only by a return.
	///
	/// \p N must produce exactly one value that has exactly one use. That use
	/// must be either a CopyToReg without a glue operand or an FP_EXTEND, and
	/// every user of that node must in turn be an X86ISD::RET_FLAG that returns
	/// no more than one value.
	///
	/// \param N the node whose use is inspected.
	/// \param [in,out] Chain on success, replaced by the chain operand of the
	///        CopyToReg (left unchanged for the FP_EXTEND case); untouched when
	///        false is returned.
	/// \returns true if the conditions above hold.
	bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
	  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // The use iterator yields the using node when dereferenced.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  bool HasRet = false;
	  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
	       UI != UE; ++UI) {
	    if (UI->getOpcode() != X86ISD::RET_FLAG)
	      return false;
	    // If we are returning more than one value, we can definitely
	    // not make a tail call see PR19530
	    if (UI->getNumOperands() > 4)
	      return false;
	    // With exactly four operands the last one has to be the glue.
	    if (UI->getNumOperands() == 4 &&
	        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	/// Returns the type a return value of type \p VT is extended to: \p VT itself
	/// unless it is narrower than the register type of the minimum return type,
	/// in which case that register type is returned. \p ExtendKind is not
	/// consulted.
	EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
	                                           ISD::NodeType ExtendKind) const {
	  const bool IsDarwin = Subtarget.getTargetTriple().isOSDarwin();

	  // The ABI does not require i1, i8 or i16 to be extended.
	  //
	  // On Darwin, there is code in the wild relying on Clang's old behaviour of
	  // always extending i8/i16 return values, so keep doing that for now.
	  // (PR26665).
	  const bool IsI8OrI16 = VT == MVT::i8 || VT == MVT::i16;
	  const bool ExtensionNotRequired = VT == MVT::i1 || (!IsDarwin && IsI8OrI16);
	  const MVT FloorVT = ExtensionNotRequired ? MVT::i8 : MVT::i32;

	  const EVT MinVT = getRegisterType(Context, FloorVT);
	  if (VT.bitsLT(MinVT))
	    return MinVT;
	  return VT;
	}

	/// Assembles a v64i1 mask from two 32-bit registers.
	/// \param VA Location of the low 32 bits.
	/// \param NextVA Location of the high 32 bits.
	/// \param Root Chain used for both register reads.
	/// \param [in,out] InFlag Optional glue. When null, the physical registers
	///                        are added as function live-ins and read through
	///                        the returned registers. Otherwise the physical
	///                        registers are read directly, glued to *InFlag,
	///                        and *InFlag is updated to the glue of the last
	///                        read.
	/// \return the v64i1 value formed by concatenating the two v32i1 halves.
	static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
	                                SDValue &Root, SelectionDAG &DAG,
	                                const SDLoc &Dl, const X86Subtarget &Subtarget,
	                                SDValue *InFlag = nullptr) {
	  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
	  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	  assert(VA.getValVT() == MVT::v64i1 &&
	         "Expecting first location of 64 bit width type");
	  assert(NextVA.getValVT() == VA.getValVT() &&
	         "The locations should have the same type");
	  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	         "The values should reside in two registers");

	  // Read one i32 from each of the two registers.
	  SDValue LoWord, HiWord;
	  if (InFlag) {
	    // Read the physical registers directly and glue the reads together.
	    LoWord = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = LoWord.getValue(2);
	    HiWord =
	        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = HiWord.getValue(2);
	  } else {
	    // No glue given: go through registers obtained from addLiveIn.
	    MachineFunction &MF = DAG.getMachineFunction();
	    const TargetRegisterClass *GR32 = &X86::GR32RegClass;
	    const unsigned LoVReg = MF.addLiveIn(VA.getLocReg(), GR32);
	    LoWord = DAG.getCopyFromReg(Root, Dl, LoVReg, MVT::i32);
	    const unsigned HiVReg = MF.addLiveIn(NextVA.getLocReg(), GR32);
	    HiWord = DAG.getCopyFromReg(Root, Dl, HiVReg, MVT::i32);
	  }

	  // Reinterpret each i32 as a v32i1 and join the halves, low half first.
	  SDValue LoMask = DAG.getBitcast(MVT::v32i1, LoWord);
	  SDValue HiMask = DAG.getBitcast(MVT::v32i1, HiWord);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LoMask, HiMask);
	}

	/// Converts a scalar register value into the mask type \p ValVT
	/// (v1i1/v8i1/v16i1/v32i1/v64i1).
	/// \returns the DAG node holding the value as a mask.
	static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
	                               const EVT &ValLoc, const SDLoc &Dl,
	                               SelectionDAG &DAG) {
	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);

	  if (ValVT == MVT::v64i1) {
	    // In 32 bit machine, this case is handled by getv64i1Argument
	    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
	    // In 64 bit machine, there is no need to truncate the value, only bitcast
	    return DAG.getBitcast(ValVT, ValArg);
	  }

	  // Pick the integer type with as many bits as the mask has elements.
	  MVT NarrowVT;
	  switch (ValVT.getSimpleVT().SimpleTy) {
	  case MVT::v8i1:
	    NarrowVT = MVT::i8;
	    break;
	  case MVT::v16i1:
	    NarrowVT = MVT::i16;
	    break;
	  case MVT::v32i1:
	    NarrowVT = MVT::i32;
	    break;
	  default:
	    llvm_unreachable("Expecting a vector of i1 types");
	  }

	  // Truncate to that width, then reinterpret the bits as the mask.
	  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, Dl, NarrowVT, ValArg);
	  return DAG.getBitcast(ValVT, Narrowed);
	}

	/// Lower the result values of a call into the
	/// appropriate copies out of appropriate physical registers.
	///
	/// Locations are assigned with RetCC_X86. Each result is copied out of its
	/// register, converted back to its value type where necessary, and appended
	/// to \p InVals.
	///
	/// \param Chain the incoming chain; the updated chain is returned.
	/// \param InFlag glue threaded through the register copies.
	/// \param RegMask if non-null, the bits of every register used for a result
	///        (including its sub-registers) are cleared in this mask.
	/// \returns the chain after all copies.
	SDValue X86TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	    uint32_t *RegMask) const {

	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  bool Is64Bit = Subtarget.is64Bit();
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

	  // Copy all of the result registers out of their specified physreg.
	  // I indexes RVLocs and InsIndex indexes Ins; I is advanced an extra step
	  // below when one value occupies two locations.
	  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++InsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    EVT CopyVT = VA.getLocVT();

	    // In some calling conventions we need to remove the used registers
	    // from the register mask.
	    if (RegMask) {
	      // The mask holds one bit per register, 32 registers per word. The
	      // iterator must be dereferenced to obtain the register number.
	      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
	           SubRegs.isValid(); ++SubRegs)
	        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
	    }

	    // If this is x86-64, and we disabled SSE, we can't return FP values
	    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
	        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // If we prefer to use the value in xmm registers, copy it out as f80 and
	    // use a truncate to move it from fp stack reg to xmm reg.
	    bool RoundAfterCopy = false;
	    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
	        isScalarFPTypeInSSEReg(VA.getValVT())) {
	      if (!Subtarget.hasX87())
	        report_fatal_error("X87 register return with X87 disabled");
	      CopyVT = MVT::f80;
	      RoundAfterCopy = (CopyVT != VA.getLocVT());
	    }

	    SDValue Val;
	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");
	      // Consumes the next location as well, hence the pre-increment of I.
	      Val =
	          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
	    } else {
	      // Result 0 of the copy is the value, 1 the chain, 2 the glue.
	      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
	                  .getValue(1);
	      Val = Chain.getValue(0);
	      InFlag = Chain.getValue(2);
	    }

	    if (RoundAfterCopy)
	      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
	                        // This truncation won't change the value.
	                        DAG.getIntPtrConstant(1, dl));

	    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
	      if (VA.getValVT().isVector() &&
	          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
	           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
	        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
	      } else
	        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	    }

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	//===----------------------------------------------------------------------===//
	// C & StdCall & Fast Calling Convention implementation
	//===----------------------------------------------------------------------===//
	// The StdCall calling convention is the standard for many Windows API
	// routines. It differs from the C calling convention only slightly: the
	// callee, not the caller, cleans up the stack, and symbols are also decorated
	// in a special way. It does not support any vector arguments.
	// For information on the fast calling convention, see the Fast Calling
	// Convention (tail call) implementation, LowerX86_32FastCCCallTo.

	/// Classification of struct-return (sret) handling. callIsStructReturn
	/// determines whether a call uses struct return semantics, and
	/// argsAreStructReturn does the same for a function's incoming arguments;
	/// both report their answer with this enum.
	enum StructReturnType {
	// No arguments, or the first argument is not flagged sret.
	NotStructReturn,
	// The sret argument is flagged inreg, or the target is MCU.
	RegStructReturn,
	// Any other sret argument.
	StackStructReturn
	};
	/// Classifies a call by the flags of its first outgoing argument: not a
	/// struct return when there is no sret first argument, a register struct
	/// return when that argument is inreg or \p IsMCU is set, and a stack struct
	/// return otherwise.
	static StructReturnType
	callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
	  const bool HasSRet = !Outs.empty() && Outs[0].Flags.isSRet();
	  if (!HasSRet)
	    return NotStructReturn;

	  const bool ViaRegister = Outs[0].Flags.isInReg() || IsMCU;
	  return ViaRegister ? RegStructReturn : StackStructReturn;
	}

	/// Classifies a function by the flags of its first incoming argument: not a
	/// struct return when there is no sret first argument, a register struct
	/// return when that argument is inreg or \p IsMCU is set, and a stack struct
	/// return otherwise.
	static StructReturnType
	argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
	  const bool HasSRet = !Ins.empty() && Ins[0].Flags.isSRet();
	  if (!HasSRet)
	    return NotStructReturn;

	  const bool ViaRegister = Ins[0].Flags.isInReg() || IsMCU;
	  return ViaRegister ? RegStructReturn : StackStructReturn;
	}

	/// Make a copy of an aggregate at address specified by "Src" to address
	/// "Dst" with size and alignment information specified by the specific
	/// parameter attribute. The copy will be passed as a byval function parameter.
	///
	/// \param Flags supplies the byval size and alignment of the aggregate.
	/// \returns the node produced by the memcpy; the copy is requested as
	///          non-volatile, always inlined, and not a tail call.
	static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
	                                         SDValue Chain, ISD::ArgFlagsTy Flags,
	                                         SelectionDAG &DAG, const SDLoc &dl) {
	  // The byte count is passed as an i32 constant.
	  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

	  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
	                       /*isVolatile*/false, /*AlwaysInline=*/true,
	                       /*isTailCall*/false,
	                       MachinePointerInfo(), MachinePointerInfo());
	}

	/// Tells whether \p CC is one of the calling conventions for which tail call
	/// optimization can be guaranteed: Fast, GHC, X86_RegCall, HiPE or HHVM.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  switch (CC) {
	  case CallingConv::Fast:
	  case CallingConv::GHC:
	  case CallingConv::X86_RegCall:
	  case CallingConv::HiPE:
	  case CallingConv::HHVM:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Tells whether tail call optimization might ever be performed for calls
	/// using \p CC: the C-style and callee-pop conventions listed below, plus
	/// every convention accepted by canGuaranteeTCO.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  // C calling conventions.
	  const bool IsCStyle = CC == CallingConv::C || CC == CallingConv::Win64 ||
	                        CC == CallingConv::X86_64_SysV;
	  // Callee pop conventions.
	  const bool IsCalleePop =
	      CC == CallingConv::X86_ThisCall || CC == CallingConv::X86_StdCall ||
	      CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall;

	  if (IsCStyle || IsCalleePop)
	    return true;
	  return canGuaranteeTCO(CC);
	}

	/// Tells whether the function is being turned into a tail call target by
	/// changing its ABI, i.e. guaranteed tail call optimization is enabled and
	/// \p CC supports it.
	static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
	  if (!GuaranteedTailCallOpt)
	    return false;
	  return canGuaranteeTCO(CC);
	}

	bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	auto Attr =
	CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
	if (!CI->isTailCall() \|\| Attr.getValueAsString() == "true")
	return false;

	ImmutableCallSite CS(CI);
	CallingConv::ID CalleeCC = CS.getCallingConv();
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	return true;
	}

	/// Lowers the incoming argument \p Ins[i], which \p VA places in memory.
	///
	/// For a byval argument a fixed stack object is created and its frame index
	/// is returned. Otherwise a load from a fixed stack object at the
	/// argument's stack offset is returned, truncated (or converted with
	/// SCALAR_TO_VECTOR) back to the value type when an i1-based value was
	/// extended in memory. Copy-elision candidates get (or reuse) a mutable
	/// stack object covering the whole original argument.
	///
	/// \param Chain chain the generated loads are attached to.
	/// \param MFI frame info in which the fixed objects are created.
	/// \param i index of the argument in \p Ins.
	SDValue
	X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
	                                    const SmallVectorImpl<ISD::InputArg> &Ins,
	                                    const SDLoc &dl, SelectionDAG &DAG,
	                                    const CCValAssign &VA,
	                                    MachineFrameInfo &MFI, unsigned i) const {
	  // Create the nodes corresponding to a load from this parameter slot.
	  ISD::ArgFlagsTy Flags = Ins[i].Flags;
	  bool AlwaysUseMutable = shouldGuaranteeTCO(
	      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
	  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
	  EVT ValVT;
	  MVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // If value is passed by pointer we have address passed instead of the value
	  // itself. No need to extend if the mask value and location share the same
	  // absolute size.
	  bool ExtendedInMem =
	      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
	      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

	  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
	    ValVT = VA.getLocVT();
	  else
	    ValVT = VA.getValVT();

	  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
	  // taken by a return address.
	  int Offset = 0;
	  if (CallConv == CallingConv::X86_INTR) {
	    // X86 interrupts may take one or two arguments.
	    // On the stack there will be no return address as in regular call.
	    // Offset of last argument need to be set to -4/-8 bytes.
	    // Where offset of the first argument out of two, should be set to 0 bytes.
	    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
	    if (Subtarget.is64Bit() && Ins.size() == 2) {
	      // The stack pointer needs to be realigned for 64 bit handlers with error
	      // code, so the argument offset changes by 8 bytes.
	      Offset += 8;
	    }
	  }

	  // FIXME: For now, all byval parameter objects are marked mutable. This can be
	  // changed with more analysis.
	  // In case of tail call optimization mark all arguments mutable. Since they
	  // could be overwritten by lowering of arguments in case of a tail call.
	  if (Flags.isByVal()) {
	    unsigned Bytes = Flags.getByValSize();
	    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
	    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
	    // Adjust SP offset of interrupt parameter.
	    if (CallConv == CallingConv::X86_INTR) {
	      MFI.setObjectOffset(FI, Offset);
	    }
	    return DAG.getFrameIndex(FI, PtrVT);
	  }

	  // This is an argument in memory. We might be able to perform copy elision.
	  if (Flags.isCopyElisionCandidate()) {
	    EVT ArgVT = Ins[i].ArgVT;
	    SDValue PartAddr;
	    if (Ins[i].PartOffset == 0) {
	      // If this is a one-part value or the first part of a multi-part value,
	      // create a stack object for the entire argument value type and return a
	      // load from our portion of it. This assumes that if the first part of an
	      // argument is in memory, the rest will also be in memory.
	      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
	                                     /*Immutable=*/false);
	      PartAddr = DAG.getFrameIndex(FI, PtrVT);
	      return DAG.getLoad(
	          ValVT, dl, Chain, PartAddr,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	    } else {
	      // This is not the first piece of an argument in memory. See if there is
	      // already a fixed stack object including this offset. If so, assume it
	      // was created by the PartOffset == 0 branch above and create a load from
	      // the appropriate offset into it.
	      int64_t PartBegin = VA.getLocMemOffset();
	      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
	      int FI = MFI.getObjectIndexBegin();
	      for (; MFI.isFixedObjectIndex(FI); ++FI) {
	        int64_t ObjBegin = MFI.getObjectOffset(FI);
	        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
	        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
	          break;
	      }
	      // FI is still a fixed object index only if the search above found an
	      // enclosing object; otherwise fall through to the generic path below.
	      if (MFI.isFixedObjectIndex(FI)) {
	        SDValue Addr =
	            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
	                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
	        return DAG.getLoad(
	            ValVT, dl, Chain, Addr,
	            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
	                                              Ins[i].PartOffset));
	      }
	    }
	  }

	  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
	                                 VA.getLocMemOffset(), isImmutable);

	  // Set SExt or ZExt flag.
	  if (VA.getLocInfo() == CCValAssign::ZExt) {
	    MFI.setObjectZExt(FI, true);
	  } else if (VA.getLocInfo() == CCValAssign::SExt) {
	    MFI.setObjectSExt(FI, true);
	  }

	  // Adjust SP offset of interrupt parameter.
	  if (CallConv == CallingConv::X86_INTR) {
	    MFI.setObjectOffset(FI, Offset);
	  }

	  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	  SDValue Val = DAG.getLoad(
	      ValVT, dl, Chain, FIN,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	  // A value that was extended in memory is converted back to its value type.
	  return ExtendedInMem
	             ? (VA.getValVT().isVector()
	                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
	                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
	             : Val;
	}

	// FIXME: Get this from tablegen.
	/// Returns the list of 64-bit general purpose argument registers for
	/// \p CallConv: RCX, RDX, R8, R9 when the convention is a Win64 one, and
	/// RDI, RSI, RDX, RCX, R8, R9 otherwise.
	static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  static const MCPhysReg Win64ArgGPRs[] = {
	    X86::RCX, X86::RDX, X86::R8, X86::R9
	  };
	  static const MCPhysReg DefaultArgGPRs[] = {
	    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
	  };

	  if (Subtarget.isCallingConvWin64(CallConv))
	    return makeArrayRef(std::begin(Win64ArgGPRs), std::end(Win64ArgGPRs));
	  return makeArrayRef(std::begin(DefaultArgGPRs), std::end(DefaultArgGPRs));
	}

	// FIXME: Get this from tablegen.
	/// Returns the list of XMM argument registers for \p CallConv: XMM0 through
	/// XMM7, or an empty list for Win64 conventions and whenever soft-float, the
	/// NoImplicitFloat attribute or a missing SSE1 rules out their use.
	static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
	                                                CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  // The XMM registers which might contain var arg parameters are shadowed
	  // in their paired GPR. So we only need to save the GPR to their home
	  // slots.
	  // TODO: __vectorcall will change this.
	  if (Subtarget.isCallingConvWin64(CallConv))
	    return None;

	  bool NoImplicitFloatOps =
	      MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
	  bool isSoftFloat = Subtarget.useSoftFloat();
	  assert(!(isSoftFloat && NoImplicitFloatOps) &&
	         "SSE register cannot be used when SSE is disabled!");

	  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
	  // registers.
	  const bool XMMsUsable =
	      !isSoftFloat && !NoImplicitFloatOps && Subtarget.hasSSE1();
	  if (!XMMsUsable)
	    return None;

	  static const MCPhysReg ArgXMMs[] = {
	    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	  };
	  return makeArrayRef(std::begin(ArgXMMs), std::end(ArgXMMs));
	}

	#ifndef NDEBUG
	/// Checks that the value numbers of \p ArgLocs never decrease from one
	/// location to the next.
	static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
	  for (unsigned Idx = 1, End = ArgLocs.size(); Idx < End; ++Idx) {
	    // A location with a smaller value number than its predecessor breaks
	    // the ordering.
	    if (ArgLocs[Idx].getValNo() < ArgLocs[Idx - 1].getValNo())
	      return false;
	  }
	  return true;
	}
	#endif

	/// Lower the incoming (formal) arguments of the function being compiled.
	///
	/// Locations are assigned to every entry of \p Ins with CC_X86; each argument
	/// is then materialized (register copy or memory argument) and appended to
	/// \p InVals. Along the way the function records sret, vararg, musttail
	/// forwarding, callee-pop and argument-stack-size information in the
	/// X86MachineFunctionInfo, and finally returns the (possibly updated) chain.
	///
	/// NOTE(review): the "\|\|" sequences in this function look like an escaped
	/// rendering of the logical-or operator - confirm against the upstream file.
	SDValue X86TargetLowering::LowerFormalArguments(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	MachineFunction &MF = DAG.getMachineFunction();
	X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

	// An externally visible function named "main" gets a forced frame pointer
	// when Subtarget.isTargetCygMing() holds.
	const Function &F = MF.getFunction();
	if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
	F.getName() == "main")
	FuncInfo->setForceFramePointer(true);

	MachineFrameInfo &MFI = MF.getFrameInfo();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

	assert(
	!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

	// X86_INTR functions take exactly one argument, or two when the second one
	// is a pointer-width integer (i64 in 64-bit mode, i32 otherwise).
	if (CallConv == CallingConv::X86_INTR) {
	bool isLegal = Ins.size() == 1 \|\|
	(Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) \|\|
	(!Is64Bit && Ins[1].VT == MVT::i32)));
	if (!isLegal)
	report_fatal_error("X86 interrupts may take one or two arguments");
	}

	// Assign locations to all of the incoming arguments.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Ins, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
	}

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// I walks ArgLocs while InsIndex walks Ins. The two diverge whenever one
	// argument occupies two locations: the custom v64i1 case below advances I
	// an extra step without advancing InsIndex.
	SDValue ArgValue;
	for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++InsIndex) {
	assert(InsIndex < Ins.size() && "Invalid Ins index");
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	EVT RegVT = VA.getLocVT();
	if (VA.needsCustom()) {
	assert(
	VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");

	// v64i1 values, in regcall calling convention, that are
	// compiled to 32 bit arch, are split up into two registers.
	ArgValue =
	getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
	} else {
	// Select the register class that matches the location type; some
	// choices depend on the AVX512 / VLX subtarget features.
	const TargetRegisterClass *RC;
	if (RegVT == MVT::i32)
	RC = &X86::GR32RegClass;
	else if (Is64Bit && RegVT == MVT::i64)
	RC = &X86::GR64RegClass;
	else if (RegVT == MVT::f32)
	RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
	else if (RegVT == MVT::f64)
	RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
	else if (RegVT == MVT::f80)
	RC = &X86::RFP80RegClass;
	else if (RegVT == MVT::f128)
	RC = &X86::FR128RegClass;
	else if (RegVT.is512BitVector())
	RC = &X86::VR512RegClass;
	else if (RegVT.is256BitVector())
	RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
	else if (RegVT.is128BitVector())
	RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
	else if (RegVT == MVT::x86mmx)
	RC = &X86::VR64RegClass;
	else if (RegVT == MVT::v1i1)
	RC = &X86::VK1RegClass;
	else if (RegVT == MVT::v8i1)
	RC = &X86::VK8RegClass;
	else if (RegVT == MVT::v16i1)
	RC = &X86::VK16RegClass;
	else if (RegVT == MVT::v32i1)
	RC = &X86::VK32RegClass;
	else if (RegVT == MVT::v64i1)
	RC = &X86::VK64RegClass;
	else
	llvm_unreachable("Unknown argument type!");

	// Mark the physical register live-in and read it through the returned
	// register.
	unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
	}

	// If this is an 8 or 16-bit value, it is really passed promoted to 32
	// bits. Insert an assert[sz]ext to capture this, then truncate to the
	// right size.
	if (VA.getLocInfo() == CCValAssign::SExt)
	ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
	DAG.getValueType(VA.getValVT()));
	else if (VA.getLocInfo() == CCValAssign::ZExt)
	ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
	DAG.getValueType(VA.getValVT()));
	else if (VA.getLocInfo() == CCValAssign::BCvt)
	ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

	// Convert an extended location value back to the declared value type.
	if (VA.isExtInLoc()) {
	// Handle MMX values passed in XMM regs.
	if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
	ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
	else if (VA.getValVT().isVector() &&
	VA.getValVT().getScalarType() == MVT::i1 &&
	((VA.getLocVT() == MVT::i64) \|\| (VA.getLocVT() == MVT::i32) \|\|
	(VA.getLocVT() == MVT::i16) \|\| (VA.getLocVT() == MVT::i8))) {
	// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
	} else
	ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	}
	} else {
	// Not in a register: the argument lives in memory and is lowered by
	// LowerMemArgument, which is indexed by InsIndex rather than I.
	assert(VA.isMemLoc());
	ArgValue =
	LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
	}

	// If value is passed via pointer - do a load.
	if (VA.getLocInfo() == CCValAssign::Indirect)
	ArgValue =
	DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

	InVals.push_back(ArgValue);
	}

	// Look for an sret argument. Only the first one found is handled, because
	// the loop breaks right after processing it.
	for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
	// Swift calling convention does not require we copy the sret argument
	// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
	if (CallConv == CallingConv::Swift)
	continue;

	// All x86 ABIs require that for returning structs by value we copy the
	// sret argument into %rax/%eax (depending on ABI) for the return. Save
	// the argument into a virtual register so that we can access it from the
	// return points.
	if (Ins[I].Flags.isSRet()) {
	unsigned Reg = FuncInfo->getSRetReturnReg();
	if (!Reg) {
	MVT PtrTy = getPointerTy(DAG.getDataLayout());
	Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
	FuncInfo->setSRetReturnReg(Reg);
	}
	// The copy hangs off the entry node and is then merged into Chain with a
	// TokenFactor.
	SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
	break;
	}
	}

	unsigned StackSize = CCInfo.getNextStackOffset();
	// Align stack specially for tail calls.
	if (shouldGuaranteeTCO(CallConv,
	MF.getTarget().Options.GuaranteedTailCallOpt))
	StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

	// If the function takes variable number of arguments, make a frame index for
	// the start of the first vararg value... for expansion of llvm.va_start. We
	// can skip this if there are no va_start calls.
	if (MFI.hasVAStart() &&
	(Is64Bit \|\| (CallConv != CallingConv::X86_FastCall &&
	CallConv != CallingConv::X86_ThisCall))) {
	FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
	}

	// Figure out if XMM registers are in use.
	assert(!(Subtarget.useSoftFloat() &&
	F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
	"SSE register cannot be used when SSE is disabled!");

	// 64-bit calling conventions support varargs and register parameters, so we
	// have to do extra work to spill them in the prologue.
	if (Is64Bit && isVarArg && MFI.hasVAStart()) {
	// Find the first unallocated argument registers.
	ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
	ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
	unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
	assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
	"SSE register cannot be used when SSE is disabled!");

	// Gather all the live in physical registers.
	SmallVector<SDValue, 6> LiveGPRs;
	SmallVector<SDValue, 8> LiveXMMRegs;
	SDValue ALVal;
	// Only the argument registers past the allocated ones are copied.
	for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
	unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
	LiveGPRs.push_back(
	DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
	}
	// When XMM argument registers exist, AL is read as well; it is later passed
	// to the VASTART_SAVE_XMM_REGS node.
	if (!ArgXMMs.empty()) {
	unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
	for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
	unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
	LiveXMMRegs.push_back(
	DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
	}
	}

	if (IsWin64) {
	// Get to the caller-allocated home save location. Add 8 to account
	// for the return address.
	int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
	FuncInfo->setRegSaveFrameIndex(
	MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
	// Fixup to set vararg frame on shadow area (4 x i64).
	if (NumIntRegs < 4)
	FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
	} else {
	// For X86-64, if there are vararg parameters that are passed via
	// registers, then we must store them to their spots on the stack so
	// they may be loaded by dereferencing the result of va_next.
	FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
	FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
	// The save area holds 8 bytes per GPR plus 16 bytes per XMM register and
	// is created with alignment 16.
	FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
	ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
	}

	// Store the integer parameter registers.
	SmallVector<SDValue, 8> MemOps;
	SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
	getPointerTy(DAG.getDataLayout()));
	unsigned Offset = FuncInfo->getVarArgsGPOffset();
	for (SDValue Val : LiveGPRs) {
	SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	RSFIN, DAG.getIntPtrConstant(Offset, dl));
	// Result 1 of the register copy is used as this store's chain operand, so
	// each store is ordered after its own copy.
	SDValue Store =
	DAG.getStore(Val.getValue(1), dl, Val, FIN,
	MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(),
	FuncInfo->getRegSaveFrameIndex(), Offset));
	MemOps.push_back(Store);
	Offset += 8;
	}

	if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
	// Now store the XMM (fp + vector) parameter registers.
	// Operand order: chain, AL value, register-save frame index, FP offset,
	// followed by the live XMM register values.
	SmallVector<SDValue, 12> SaveXMMOps;
	SaveXMMOps.push_back(Chain);
	SaveXMMOps.push_back(ALVal);
	SaveXMMOps.push_back(DAG.getIntPtrConstant(
	FuncInfo->getRegSaveFrameIndex(), dl));
	SaveXMMOps.push_back(DAG.getIntPtrConstant(
	FuncInfo->getVarArgsFPOffset(), dl));
	SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
	LiveXMMRegs.end());
	MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
	MVT::Other, SaveXMMOps));
	}

	// Merge every register-save operation into a single chain.
	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
	}

	// Vararg functions containing a musttail call must forward their register
	// parameters; compute and save them here.
	if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
	// Find the largest legal vector type.
	MVT VecVT = MVT::Other;
	// FIXME: Only some x86_32 calling conventions support AVX512.
	if (Subtarget.hasAVX512() &&
	(Is64Bit \|\| (CallConv == CallingConv::X86_VectorCall \|\|
	CallConv == CallingConv::Intel_OCL_BI)))
	VecVT = MVT::v16f32;
	else if (Subtarget.hasAVX())
	VecVT = MVT::v8f32;
	else if (Subtarget.hasSSE2())
	VecVT = MVT::v4f32;

	// We forward some GPRs and some vector types.
	SmallVector<MVT, 2> RegParmTypes;
	MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
	RegParmTypes.push_back(IntVT);
	if (VecVT != MVT::Other)
	RegParmTypes.push_back(VecVT);

	// Compute the set of forwarded registers. The rest are scratch.
	SmallVectorImpl<ForwardedRegister> &Forwards =
	FuncInfo->getForwardedMustTailRegParms();
	CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

	// Conservatively forward AL on x86_64, since it might be used for varargs.
	if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
	unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
	}

	// Copy all forwards from physical to virtual registers.
	// Each entry's VReg is replaced by a freshly created virtual register, and
	// the copies are threaded one after another through Chain.
	for (ForwardedRegister &F : Forwards) {
	// FIXME: Can we use a less constrained schedule?
	SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
	Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
	}
	}

	// Some CCs need callee pop.
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	MF.getTarget().Options.GuaranteedTailCallOpt)) {
	FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
	} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
	// X86 interrupts must pop the error code (and the alignment padding) if
	// present.
	FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
	} else {
	FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
	// If this is an sret function, the return should pop the hidden pointer.
	if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
	FuncInfo->setBytesToPopOnReturn(4);
	}

	if (!Is64Bit) {
	// RegSaveFrameIndex is X86-64 only.
	// NOTE(review): 0xAAAAAAA looks like a deliberately bogus placeholder
	// index - confirm.
	FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
	if (CallConv == CallingConv::X86_FastCall \|\|
	CallConv == CallingConv::X86_ThisCall)
	// fastcc functions can't have varargs.
	FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
	}

	FuncInfo->setArgumentStackSize(StackSize);

	// With WinEH info present and a CoreCLR personality, create the 8-byte
	// PSPSym stack slot and record its frame index.
	if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
	EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
	if (Personality == EHPersonality::CoreCLR) {
	assert(Is64Bit);
	// TODO: Add a mechanism to frame lowering that will allow us to indicate
	// that we'd prefer this slot be allocated towards the bottom of the frame
	// (i.e. near the stack pointer after allocating the frame). Every
	// funclet needs a copy of this slot in its (mostly empty) frame, and the
	// offset from the bottom of this and each funclet's frame must be the
	// same, so the size of funclets' (mostly empty) frames is dictated by
	// how far this slot is from the bottom (since they allocate just enough
	// space to accommodate holding this slot at the correct offset).
	// NOTE(review): the third argument below looks like an inline
	// argument-name comment whose asterisks were lost - confirm against the
	// upstream file.
	int PSPSymFI = MFI.CreateStackObject(8, 8, /isSS=/false);
	EHInfo->PSPSymFrameIdx = PSPSymFI;
	}
	}

	// For regcall, or functions carrying the "no_caller_saved_registers"
	// attribute, call disableCalleeSavedRegister on every live-in register.
	if (CallConv == CallingConv::X86_RegCall \|\|
	F.hasFnAttribute("no_caller_saved_registers")) {
	MachineRegisterInfo &MRI = MF.getRegInfo();
	for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
	MRI.disableCalleeSavedRegister(Pair.first);
	}

	return Chain;
	}

	/// Place the outgoing argument \p Arg into the memory location described by
	/// \p VA. The destination address is \p StackPtr plus the location's memory
	/// offset. By-value arguments are copied with CreateCopyOfByValArgument;
	/// everything else is written with a plain store. Returns the resulting node.
	SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
	                                            SDValue Arg, const SDLoc &dl,
	                                            SelectionDAG &DAG,
	                                            const CCValAssign &VA,
	                                            ISD::ArgFlagsTy Flags) const {
	  const unsigned SlotOffset = VA.getLocMemOffset();

	  // Build the destination address: StackPtr + SlotOffset.
	  SDValue OffsetNode = DAG.getIntPtrConstant(SlotOffset, dl);
	  SDValue SlotAddr = DAG.getNode(
	      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);

	  if (!Flags.isByVal()) {
	    // Ordinary argument: a single store to the stack slot.
	    return DAG.getStore(
	        Chain, dl, Arg, SlotAddr,
	        MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset));
	  }

	  // By-value aggregate: copy its contents to the slot instead.
	  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
	}

	/// Load the current ("old") return address from its frame slot.
	///
	/// On return \p OutRetAddr holds the loaded value, and the function result
	/// is result #1 of that same load node. The \p IsTailCall, \p Is64Bit and
	/// \p FPDiff parameters are not consulted by the body.
	SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
	    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
	    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Frame index of the slot that currently holds the return address.
	  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);

	  // Read the return address out of that slot.
	  OutRetAddr =
	      DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());
	  return OutRetAddr.getValue(1);
	}

	/// Store the return address \p RetAddrFrIdx into a newly created fixed stack
	/// object located at offset FPDiff - SlotSize, and return the store.
	/// When \p FPDiff is zero nothing is emitted and \p Chain is returned as is.
	static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
	                                        SDValue Chain, SDValue RetAddrFrIdx,
	                                        EVT PtrVT, unsigned SlotSize,
	                                        int FPDiff, const SDLoc &dl) {
	  // No frame-pointer delta: the return address stays where it is.
	  if (FPDiff == 0)
	    return Chain;

	  // Create the fixed object that will hold the relocated return address.
	  const int64_t NewSlotOffset = static_cast<int64_t>(FPDiff) - SlotSize;
	  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
	  int RelocatedFI = FrameInfo.CreateFixedObject(SlotSize, NewSlotOffset, false);

	  // Write the return address into that object.
	  SDValue RelocatedSlot = DAG.getFrameIndex(RelocatedFI, PtrVT);
	  return DAG.getStore(Chain, dl, RetAddrFrIdx, RelocatedSlot,
	                      MachinePointerInfo::getFixedStack(
	                          DAG.getMachineFunction(), RelocatedFI));
	}

	/// Build a vector shuffle of \p V1 and \p V2 (type \p VT) whose mask is
	/// <NumElems, 1, 2, ..., NumElems-1>: the first mask entry is NumElems and
	/// every later entry equals its own position.
	static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
	                       SDValue V2) {
	  const unsigned EltCount = VT.getVectorNumElements();

	  SmallVector<int, 8> ShuffleMask;
	  // Lane 0 gets mask index EltCount.
	  ShuffleMask.push_back(EltCount);
	  // Remaining lanes use their own index.
	  unsigned Lane = 1;
	  while (Lane != EltCount) {
	    ShuffleMask.push_back(Lane);
	    ++Lane;
	  }

	  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
	}

	SDValue
	X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &dl = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	CallingConv::ID CallConv = CLI.CallConv;
	bool &isTailCall = CLI.IsTailCall;
	bool isVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
	StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
	bool IsSibcall = false;
	X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
	auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
	const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
	const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
	bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) \|\|
	(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));

	if (CallConv == CallingConv::X86_INTR)
	report_fatal_error("X86 interrupts may not be called directly");

	if (Attr.getValueAsString() == "true")
	isTailCall = false;

	if (Subtarget.isPICStyleGOT() &&
	!MF.getTarget().Options.GuaranteedTailCallOpt) {
	// If we are using a GOT, disable tail calls to external symbols with
	// default visibility. Tail calling such a symbol requires using a GOT
	// relocation, which forces early binding of the symbol. This breaks code
	// that require lazy function symbol resolution. Using musttail or
	// GuaranteedTailCallOpt will override this.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (!G \|\| (!G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility()))
	isTailCall = false;
	}

	bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
	if (IsMustTail) {
	// Force this to be a tail call. The verifier rules are enough to ensure
	// that we can lower this successfully without moving the return address
	// around.
	isTailCall = true;
	} else if (isTailCall) {
	// Check if it's really possible to do a tail call.
	isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
	isVarArg, SR != NotStructReturn,
	MF.getFunction().hasStructRetAttr(), CLI.RetTy,
	Outs, OutVals, Ins, DAG);

	// Sibcalls are automatically detected tailcalls which do not require
	// ABI changes.
	if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
	IsSibcall = true;

	if (isTailCall)
	++NumTailCalls;
	}

	assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling convention fastcc, ghc or hipe");

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Outs, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
	if (IsSibcall)
	// This is a sibcall. The memory operands are available in caller's
	// own caller's stack.
	NumBytes = 0;
	else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
	canGuaranteeTCO(CallConv))
	NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

	int FPDiff = 0;
	if (isTailCall && !IsSibcall && !IsMustTail) {
	// Lower arguments at fp - stackoffset + fpdiff.
	unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

	FPDiff = NumBytesCallerPushed - NumBytes;

	// Set the delta of movement of the returnaddr stackslot.
	// But only set if delta is greater than previous delta.
	if (FPDiff < X86Info->getTCReturnAddrDelta())
	X86Info->setTCReturnAddrDelta(FPDiff);
	}

	unsigned NumBytesToPush = NumBytes;
	unsigned NumBytesToPop = NumBytes;

	// If we have an inalloca argument, all stack space has already been allocated
	// for us and be right at the top of the stack. We don't support multiple
	// arguments passed in memory when using inalloca.
	if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
	NumBytesToPush = 0;
	if (!ArgLocs.back().isMemLoc())
	report_fatal_error("cannot use inalloca attribute on a register "
	"parameter");
	if (ArgLocs.back().getLocMemOffset() != 0)
	report_fatal_error("any parameter with the inalloca attribute must be "
	"the only memory argument");
	}

	if (!IsSibcall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
	NumBytes - NumBytesToPush, dl);

	SDValue RetAddrFrIdx;
	// Load return address for tail calls.
	if (isTailCall && FPDiff)
	Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
	Is64Bit, FPDiff, dl);

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	SDValue StackPtr;

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// Walk the register/memloc assignments, inserting copies/loads. In the case
	// of tail call optimization arguments are handle later.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutIndex) {
	assert(OutIndex < Outs.size() && "Invalid Out index");
	// Skip inalloca arguments, they have already been written.
	ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
	if (Flags.isInAlloca())
	continue;

	CCValAssign &VA = ArgLocs[I];
	EVT RegVT = VA.getLocVT();
	SDValue Arg = OutVals[OutIndex];
	bool isByVal = Flags.isByVal();

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default: llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full: break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::AExt:
	if (Arg.getValueType().isVector() &&
	Arg.getValueType().getVectorElementType() == MVT::i1)
	Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
	else if (RegVT.is128BitVector()) {
	// Special case: passing MMX values in XMM registers.
	Arg = DAG.getBitcast(MVT::i64, Arg);
	Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
	Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
	} else
	Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getBitcast(RegVT, Arg);
	break;
	case CCValAssign::Indirect: {
	// Store the argument.
	SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
	int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
	Chain = DAG.getStore(
	Chain, dl, Arg, SpillSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	Arg = SpillSlot;
	break;
	}
	}

	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");
	// Split v64i1 value into two registers
	Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
	Subtarget);
	} else if (VA.isRegLoc()) {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	if (isVarArg && IsWin64) {
	// Win64 ABI requires argument XMM reg to be copied to the corresponding
	// shadow reg if callee is a varargs function.
	unsigned ShadowReg = 0;
	switch (VA.getLocReg()) {
	case X86::XMM0: ShadowReg = X86::RCX; break;
	case X86::XMM1: ShadowReg = X86::RDX; break;
	case X86::XMM2: ShadowReg = X86::R8; break;
	case X86::XMM3: ShadowReg = X86::R9; break;
	}
	if (ShadowReg)
	RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
	}
	} else if (!IsSibcall && (!isTailCall \|\| isByVal)) {
	assert(VA.isMemLoc());
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
	dl, DAG, VA, Flags));
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	if (Subtarget.isPICStyleGOT()) {
	// ELF / PIC requires GOT in the EBX register before function calls via PLT
	// GOT pointer.
	if (!isTailCall) {
	RegsToPass.push_back(std::make_pair(
	unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	getPointerTy(DAG.getDataLayout()))));
	} else {
	// If we are tail calling and generating PIC/GOT style code load the
	// address of the callee into ECX. The value in ecx is used as target of
	// the tail jump. This is done to circumvent the ebx/callee-saved problem
	// for tail calls on PIC/GOT architectures. Normally we would just put the
	// address of GOT into ebx and then call target@PLT. But for tail calls
	// ebx would be restored (since ebx is callee saved) before jumping to the
	// target@PLT.

	// Note: The actual moving to ECX is done further down.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (G && !G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility())
	Callee = LowerGlobalAddress(Callee, DAG);
	else if (isa<ExternalSymbolSDNode>(Callee))
	Callee = LowerExternalSymbol(Callee, DAG);
	}
	}

	if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
	// From AMD64 ABI document:
	// For calls that may call functions that use varargs or stdargs
	// (prototype-less calls or calls to functions containing ellipsis (...) in
	// the declaration) %al is used as hidden argument to specify the number
	// of SSE registers used. The contents of %al do not need to match exactly
	// the number of registers, but must be an ubound on the number of SSE
	// registers used and is in the range 0 - 8 inclusive.

	// Count the number of XMM registers allocated.
	static const MCPhysReg XMMArgRegs[] = {
	X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	};
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
	assert((Subtarget.hasSSE1() \|\| !NumXMMRegs)
	&& "SSE registers cannot be used when SSE is disabled");

	RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
	DAG.getConstant(NumXMMRegs, dl,
	MVT::i8)));
	}

	if (isVarArg && IsMustTail) {
	const auto &Forwards = X86Info->getForwardedMustTailRegParms();
	for (const auto &F : Forwards) {
	SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
	}
	}

	// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
	// don't need this because the eligibility check rejects calls that require
	// shuffling arguments passed in memory.
	if (!IsSibcall && isTailCall) {
	// Force all the incoming stack arguments to be loaded from the stack
	// before any new outgoing arguments are stored to the stack, because the
	// outgoing stack slots may alias the incoming argument stack slots, and
	// the alias isn't otherwise explicit. This is slightly more conservative
	// than necessary, because it means that each store effectively depends
	// on every argument instead of just those arguments it would clobber.
	SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

	SmallVector<SDValue, 8> MemOpChains2;
	SDValue FIN;
	int FI = 0;
	for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutsIndex) {
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	if (VA.needsCustom()) {
	assert((CallConv == CallingConv::X86_RegCall) &&
	"Expecting custom case only in regcall calling convention");
	// This means that we are in special case where one argument was
	// passed through two register locations - Skip the next location
	++I;
	}

	continue;
	}

	assert(VA.isMemLoc());
	SDValue Arg = OutVals[OutsIndex];
	ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
	// Skip inalloca arguments. They don't require any work.
	if (Flags.isInAlloca())
	continue;
	// Create frame index.
	int32_t Offset = VA.getLocMemOffset()+FPDiff;
	uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
	FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
	FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

	if (Flags.isByVal()) {
	// Copy relative to framepointer.
	SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	StackPtr, Source);

	MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
	ArgChain,
	Flags, DAG, dl));
	} else {
	// Store relative to framepointer.
	MemOpChains2.push_back(DAG.getStore(
	ArgChain, dl, Arg, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
	}
	}

	if (!MemOpChains2.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

	// Store the return address to the appropriate stack slot.
	Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
	getPointerTy(DAG.getDataLayout()),
	RegInfo->getSlotSize(), FPDiff, dl);
	}

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into registers.
	SDValue InFlag;
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	RegsToPass[i].second, InFlag);
	InFlag = Chain.getValue(1);
	}

	if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
	assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
	// In the 64-bit large code model, we have to make all calls
	// through a register, since the call instruction's 32-bit
	// pc-relative offset may not be large enough to hold the whole
	// address.
	} else if (Callee->getOpcode() == ISD::GlobalAddress) {
	// If the callee is a GlobalAddress node (quite common, every direct call
	// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
	// it.
	GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);

	// We should use extra load for direct calls to dllimported functions in
	// non-JIT mode.
	const GlobalValue *GV = G->getGlobal();
	if (!GV->hasDLLImportStorageClass()) {
	unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);

	Callee = DAG.getTargetGlobalAddress(
	GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);

	if (OpFlags == X86II::MO_GOTPCREL) {
	// Add a wrapper.
	Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
	getPointerTy(DAG.getDataLayout()), Callee);
	// Add extra indirection
	Callee = DAG.getLoad(
	getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
	MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	}
	}
	} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
	unsigned char OpFlags =
	Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);

	Callee = DAG.getTargetExternalSymbol(
	S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
	} else if (Subtarget.isTarget64BitILP32() &&
	Callee->getValueType(0) == MVT::i32) {
	// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
	Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
	}

	// Returns a chain & a flag for retval copy to use.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SmallVector<SDValue, 8> Ops;

	if (!IsSibcall && isTailCall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (isTailCall)
	Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	RegsToPass[i].second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
	// set X86_INTR calling convention because it has the same CSR mask
	// (same preserved registers).
	const uint32_t *Mask = RegInfo->getCallPreservedMask(
	MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
	assert(Mask && "Missing call preserved mask for calling convention");

	// If this is an invoke in a 32-bit function using a funclet-based
	// personality, assume the function clobbers all registers. If an exception
	// is thrown, the runtime will not restore CSRs.
	// FIXME: Model this more precisely so that we can register allocate across
	// the normal edge and spill and fill across the exceptional edge.
	if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
	const Function &CallerFn = MF.getFunction();
	EHPersonality Pers =
	CallerFn.hasPersonalityFn()
	? classifyEHPersonality(CallerFn.getPersonalityFn())
	: EHPersonality::Unknown;
	if (isFuncletEHPersonality(Pers))
	Mask = RegInfo->getNoPreservedMask();
	}

	// Define a new register mask from the existing mask.
	uint32_t *RegMask = nullptr;

	// In some calling conventions we need to remove the used physical registers
	// from the reg mask.
	if (CallConv == CallingConv::X86_RegCall \|\| HasNCSR) {
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	// Allocate a new Reg Mask and copy Mask.
	RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
	unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
	memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);

	// Make sure all sub registers of the argument registers are reset
	// in the RegMask.
	for (auto const &RegPair : RegsToPass)
	for (MCSubRegIterator SubRegs(RegPair.first, TRI, /IncludeSelf=/true);
	SubRegs.isValid(); ++SubRegs)
	RegMask[SubRegs / 32] &= ~(1u << (SubRegs % 32));

	// Create the RegMask Operand according to our updated mask.
	Ops.push_back(DAG.getRegisterMask(RegMask));
	} else {
	// Create the RegMask Operand according to the static mask.
	Ops.push_back(DAG.getRegisterMask(Mask));
	}

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	if (isTailCall) {
	// We used to do:
	//// If this is the first return lowered for this function, add the regs
	//// to the liveout set for the function.
	// This isn't right, although it's probably harmless on x86; liveouts
	// should be computed from returns not tail calls. Consider a void
	// function making a tail call to a function returning int.
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
	}

	Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
	InFlag = Chain.getValue(1);

	// Create the CALLSEQ_END node.
	unsigned NumBytesForCalleeToPop;
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	DAG.getTarget().Options.GuaranteedTailCallOpt))
	NumBytesForCalleeToPop = NumBytes; // Callee pops everything
	else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	SR == StackStructReturn)
	// If this is a call to a struct-return function, the callee
	// pops the hidden struct pointer, so we have to push it back.
	// This is common for Darwin/X86, Linux & Mingw32 targets.
	// For MSVC Win32 targets, the caller pops the hidden struct pointer.
	NumBytesForCalleeToPop = 4;
	else
	NumBytesForCalleeToPop = 0; // Callee pops nothing.

	if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
	// No need to reset the stack after the call if the call doesn't return. To
	// make the MI verify, we'll pretend the callee does it for us.
	NumBytesForCalleeToPop = NumBytes;
	}

	// Returns a flag for retval copy to use.
	if (!IsSibcall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
	true),
	InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
	InVals, RegMask);
	}

	//===----------------------------------------------------------------------===//
	// Fast Calling Convention (tail call) implementation
	//===----------------------------------------------------------------------===//

	// Like stdcall, the callee cleans up its arguments, except that ECX is
	// reserved for storing the tail called function address. Only 2 registers are
	// free for argument passing (inreg). Tail call optimization is performed
	// provided:
	// * tailcallopt is enabled
	// * caller/callee are fastcc
	// On X86_64 architecture with GOT-style position independent code only local
	// (within module) calls are supported at the moment.
	// To keep the stack aligned according to platform abi the function
	// GetAlignedArgumentStackSize ensures that argument delta is always multiples
	// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
	// If a tail called function callee has more arguments than the caller the
	// caller needs to make sure that there is room to move the RETADDR to. This is
	// achieved by reserving an area the size of the argument delta right after the
	// original RETADDR, but before the saved framepointer or the spilled registers
	// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
	// stack layout:
	// arg1
	// arg2
	// RETADDR
	// [ new RETADDR
	// move area ]
	// (possible EBP)
	// ESI
	// EDI
	// local1 ..

	/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
	/// requirement.
	unsigned
	X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
	SelectionDAG& DAG) const {
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	unsigned StackAlignment = TFI.getStackAlignment();
	uint64_t AlignMask = StackAlignment - 1;
	int64_t Offset = StackSize;
	unsigned SlotSize = RegInfo->getSlotSize();
	if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
	// Number smaller than 12 so just add the difference.
	Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
	} else {
	// Mask out lower bits, add stackalignment once plus the 12 bytes.
	Offset = ((~AlignMask) & Offset) + StackAlignment +
	(StackAlignment-SlotSize);
	}
	return Offset;
	}

	/// Return true if the given stack call argument is already available in the
	/// same position (relatively) of the caller's incoming argument stack.
	///
	/// \p Arg is the outgoing argument value, \p Offset the stack offset it must
	/// end up at, \p Flags its argument flags and \p VA its assigned location.
	/// The value is traced back to a frame index; the function succeeds only if
	/// that index is a fixed object at \p Offset with a matching size (and, for
	/// non-byval arguments, an immutable one with matching extension flags).
	static
	bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
	MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
	const X86InstrInfo *TII, const CCValAssign &VA) {
	// Size of the argument value; replaced by the byval size further down when
	// the argument is passed byval.
	unsigned Bytes = Arg.getValueSizeInBits() / 8;

	for (;;) {
	// Look through nodes that don't alter the bits of the incoming value.
	unsigned Op = Arg.getOpcode();
	if (Op == ISD::ZERO_EXTEND \|\| Op == ISD::ANY_EXTEND \|\| Op == ISD::BITCAST) {
	Arg = Arg.getOperand(0);
	continue;
	}
	if (Op == ISD::TRUNCATE) {
	// A truncate is only looked through when its input is an AssertZext to
	// the truncated type.
	const SDValue &TruncInput = Arg.getOperand(0);
	if (TruncInput.getOpcode() == ISD::AssertZext &&
	cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
	Arg.getValueType()) {
	Arg = TruncInput.getOperand(0);
	continue;
	}
	}
	break;
	}

	// INT_MAX is a sentinel: every path below either sets FI or returns false.
	int FI = INT_MAX;
	if (Arg.getOpcode() == ISD::CopyFromReg) {
	// The value comes from a register; only a virtual register with a known
	// defining instruction can be traced back to a stack slot.
	unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
	if (!TargetRegisterInfo::isVirtualRegister(VR))
	return false;
	MachineInstr *Def = MRI->getVRegDef(VR);
	if (!Def)
	return false;
	if (!Flags.isByVal()) {
	if (!TII->isLoadFromStackSlot(*Def, FI))
	return false;
	} else {
	// For byval the register holds an address: accept an LEA of a frame
	// index and compare against the byval size instead of the value size.
	unsigned Opcode = Def->getOpcode();
	if ((Opcode == X86::LEA32r \|\| Opcode == X86::LEA64r \|\|
	Opcode == X86::LEA64_32r) &&
	Def->getOperand(1).isFI()) {
	FI = Def->getOperand(1).getIndex();
	Bytes = Flags.getByValSize();
	} else
	return false;
	}
	} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
	if (Flags.isByVal())
	// ByVal argument is passed in as a pointer but it's now being
	// dereferenced. e.g.
	// define @foo(%struct.X* %A) {
	// tail call @bar(%struct.X* byval %A)
	// }
	return false;
	// The load must read directly from a frame index.
	SDValue Ptr = Ld->getBasePtr();
	FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
	if (!FINode)
	return false;
	FI = FINode->getIndex();
	} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
	// A byval argument given directly as a frame index.
	FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
	FI = FINode->getIndex();
	Bytes = Flags.getByValSize();
	} else
	return false;

	assert(FI != INT_MAX);
	if (!MFI.isFixedObjectIndex(FI))
	return false;

	// The object must sit exactly where the callee expects the argument.
	if (Offset != MFI.getObjectOffset(FI))
	return false;

	// If this is not byval, check that the argument stack object is immutable.
	// inalloca and argument copy elision can create mutable argument stack
	// objects. Byval objects can be mutated, but a byval call intends to pass the
	// mutated memory.
	if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
	return false;

	if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
	// If the argument location is wider than the argument type, check that any
	// extension flags match.
	if (Flags.isZExt() != MFI.isObjectZExt(FI) \|\|
	Flags.isSExt() != MFI.isObjectSExt(FI)) {
	return false;
	}
	}

	// Finally, the stack object must have exactly the expected size.
	return Bytes == MFI.getObjectSize(FI);
	}

	/// Check whether the call is eligible for tail call optimization. Targets
	/// that want to do tail call optimization should implement this function.
	///
	/// Returns true only if every check below passes; each failed check returns
	/// false immediately. When GuaranteedTailCallOpt is set the decision is made
	/// from the calling conventions alone; otherwise the remaining checks look
	/// for a sibcall that needs no ABI change.
	bool X86TargetLowering::IsEligibleForTailCallOptimization(
	SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	MachineFunction &MF = DAG.getMachineFunction();
	const Function &CallerF = MF.getFunction();

	// If the function return type is x86_fp80 and the callee return type is not,
	// then the FP_EXTEND of the call result is not a nop. It's not safe to
	// perform a tailcall optimization here.
	if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
	return false;

	CallingConv::ID CallerCC = CallerF.getCallingConv();
	bool CCMatch = CallerCC == CalleeCC;
	bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
	bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

	// Win64 functions have extra shadow space for argument homing. Don't do the
	// sibcall if the caller and callee have mismatched expectations for this
	// space.
	if (IsCalleeWin64 != IsCallerWin64)
	return false;

	// If -tailcallopt is specified, make fastcc functions tail-callable: the
	// call qualifies exactly when the callee convention can guarantee TCO and
	// matches the caller's convention. None of the sibcall checks below apply.
	if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
	if (canGuaranteeTCO(CalleeCC) && CCMatch)
	return true;
	return false;
	}

	// Look for obvious safe cases to perform tail call optimization that do not
	// require ABI changes. This is what gcc calls sibcall.

	// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
	// emit a special epilogue.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	if (RegInfo->needsStackRealignment(MF))
	return false;

	// Also avoid sibcall optimization if either caller or callee uses struct
	// return semantics.
	if (isCalleeStructRet \|\| isCallerStructRet)
	return false;

	// Do not sibcall optimize vararg calls unless all arguments are passed via
	// registers.
	LLVMContext &C = *DAG.getContext();
	if (isVarArg && !Outs.empty()) {
	// Optimizing for varargs on Win64 is unlikely to be safe without
	// additional testing.
	if (IsCalleeWin64 \|\| IsCallerWin64)
	return false;

	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
	if (!ArgLocs[i].isRegLoc())
	return false;
	}

	// If the call result is in ST0 / ST1, it needs to be popped off the x87
	// stack. Therefore, if it's not used by the call it is not safe to optimize
	// this into a sibcall.
	bool Unused = false;
	for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	if (!Ins[i].Used) {
	Unused = true;
	break;
	}
	}
	if (Unused) {
	// At least one result is unused: reject if any result lives in FP0/FP1.
	SmallVector<CCValAssign, 16> RVLocs;
	CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
	CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
	for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
	CCValAssign &VA = RVLocs[i];
	if (VA.getLocReg() == X86::FP0 \|\| VA.getLocReg() == X86::FP1)
	return false;
	}
	}

	// Check that the call results are passed in the same way.
	if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	RetCC_X86, RetCC_X86))
	return false;
	// The callee has to preserve all registers the caller needs to preserve.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	if (!CCMatch) {
	const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	return false;
	}

	// Bytes of stack used by the outgoing arguments; stays 0 when there are
	// none. Used by the callee-pop checks at the end.
	unsigned StackArgsSize = 0;

	// If the callee takes no arguments then go on to check the results of the
	// call.
	if (!Outs.empty()) {
	// Check if stack adjustment is needed. For now, do not do this if any
	// argument is passed on the stack.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	// Allocate shadow area for Win64
	if (IsCalleeWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	StackArgsSize = CCInfo.getNextStackOffset();

	if (CCInfo.getNextStackOffset()) {
	// Check if the arguments are already laid out in the right way as
	// the caller's fixed stack objects.
	MachineFrameInfo &MFI = MF.getFrameInfo();
	const MachineRegisterInfo *MRI = &MF.getRegInfo();
	const X86InstrInfo *TII = Subtarget.getInstrInfo();
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	SDValue Arg = OutVals[i];
	ISD::ArgFlagsTy Flags = Outs[i].Flags;
	// Indirectly passed arguments are never accepted here.
	if (VA.getLocInfo() == CCValAssign::Indirect)
	return false;
	if (!VA.isRegLoc()) {
	if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
	MFI, MRI, TII, VA))
	return false;
	}
	}
	}

	bool PositionIndependent = isPositionIndependent();
	// If the tailcall address may be in a register, then make sure it's
	// possible to register allocate for it. In 32-bit, the call address can
	// only target EAX, EDX, or ECX since the tail call must be scheduled after
	// callee-saved registers are restored. These happen to be the same
	// registers used to pass 'inreg' arguments so watch out for those.
	if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
	!isa<ExternalSymbolSDNode>(Callee)) \|\|
	PositionIndependent)) {
	unsigned NumInRegs = 0;
	// In PIC we need an extra register to formulate the address computation
	// for the callee.
	unsigned MaxInRegs = PositionIndependent ? 2 : 3;

	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	if (!VA.isRegLoc())
	continue;
	unsigned Reg = VA.getLocReg();
	switch (Reg) {
	default: break;
	case X86::EAX: case X86::EDX: case X86::ECX:
	// Reject as soon as the number of arguments in these registers reaches
	// MaxInRegs, i.e. at most MaxInRegs - 1 of them are tolerated.
	if (++NumInRegs == MaxInRegs)
	return false;
	break;
	}
	}
	}

	const MachineRegisterInfo &MRI = MF.getRegInfo();
	if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	return false;
	}

	bool CalleeWillPop =
	X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
	MF.getTarget().Options.GuaranteedTailCallOpt);

	if (unsigned BytesToPop =
	MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
	// If we have bytes to pop, the callee must pop them.
	bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
	if (!CalleePopMatches)
	return false;
	} else if (CalleeWillPop && StackArgsSize > 0) {
	// If we don't have bytes to pop, make sure the callee doesn't pop any.
	return false;
	}

	return true;
	}

	/// Create the X86 fast instruction selector by forwarding both arguments to
	/// X86::createFastISel.
	FastISel *
	X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	                                  const TargetLibraryInfo *libInfo) const {
	  FastISel *Selector = X86::createFastISel(funcInfo, libInfo);
	  return Selector;
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Hooks
	//===----------------------------------------------------------------------===//

	/// Return true if \p Op has exactly one use and is a normal load.
	static bool MayFoldLoad(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalLoad(Op.getNode());
	}

	/// Return true if \p Op has exactly one use and that user is a normal store.
	static bool MayFoldIntoStore(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalStore(*Op.getNode()->use_begin());
	}

	/// Return true if \p Op has exactly one use and that user is a ZERO_EXTEND.
	static bool MayFoldIntoZeroExtend(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;

	  unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
	  return UserOpcode == ISD::ZERO_EXTEND;
	}

	/// Return true if \p Opcode is one of the X86 target shuffle opcodes listed
	/// below.
	static bool isTargetShuffle(unsigned Opcode) {
	  switch (Opcode) {
	  case X86ISD::BLENDI:
	  case X86ISD::PSHUFB:
	  case X86ISD::PSHUFD:
	  case X86ISD::PSHUFHW:
	  case X86ISD::PSHUFLW:
	  case X86ISD::SHUFP:
	  case X86ISD::INSERTPS:
	  case X86ISD::EXTRQI:
	  case X86ISD::INSERTQI:
	  case X86ISD::PALIGNR:
	  case X86ISD::VSHLDQ:
	  case X86ISD::VSRLDQ:
	  case X86ISD::MOVLHPS:
	  case X86ISD::MOVHLPS:
	  case X86ISD::MOVLPS:
	  case X86ISD::MOVLPD:
	  case X86ISD::MOVSHDUP:
	  case X86ISD::MOVSLDUP:
	  case X86ISD::MOVDDUP:
	  case X86ISD::MOVSS:
	  case X86ISD::MOVSD:
	  case X86ISD::UNPCKL:
	  case X86ISD::UNPCKH:
	  case X86ISD::VBROADCAST:
	  case X86ISD::VPERMILPI:
	  case X86ISD::VPERMILPV:
	  case X86ISD::VPERM2X128:
	  case X86ISD::VPERMIL2:
	  case X86ISD::VPERMI:
	  case X86ISD::VPPERM:
	  case X86ISD::VPERMV:
	  case X86ISD::VPERMV3:
	  case X86ISD::VPERMIV3:
	  case X86ISD::VZEXT_MOVL:
	    return true;
	  default:
	    break;
	  }
	  // Anything not listed above is not a target shuffle.
	  return false;
	}

	/// Return true if \p Opcode is a target shuffle that takes a variable mask,
	/// or one of the 'faux' shuffles (AND / ANDNP) treated the same way.
	static bool isTargetShuffleVariableMask(unsigned Opcode) {
	  switch (Opcode) {
	  // Target Shuffles.
	  case X86ISD::PSHUFB:
	  case X86ISD::VPERMILPV:
	  case X86ISD::VPERMIL2:
	  case X86ISD::VPPERM:
	  case X86ISD::VPERMV:
	  case X86ISD::VPERMV3:
	  case X86ISD::VPERMIV3:
	  // 'Faux' Target Shuffles.
	  case ISD::AND:
	  case X86ISD::ANDNP:
	    return true;
	  default:
	    break;
	  }
	  return false;
	}

	/// Return a frame index node for the return address slot, creating the fixed
	/// stack object on first use and caching its index in the function info.
	SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

	  int RAIndex = FuncInfo->getRAIndex();
	  // An index of 0 is treated as "not created yet".
	  if (RAIndex == 0) {
	    // The return address occupies one slot, placed one slot below offset 0.
	    const unsigned SlotSize = RegInfo->getSlotSize();
	    const int64_t SlotOffset = -static_cast<int64_t>(SlotSize);
	    RAIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, SlotOffset, false);
	    FuncInfo->setRAIndex(RAIndex);
	  }

	  return DAG.getFrameIndex(RAIndex, getPointerTy(DAG.getDataLayout()));
	}

	/// Return true if \p Offset may be used as a displacement under code model
	/// \p M.
	///
	/// \param Offset                  the displacement to test.
	/// \param M                       the code model in effect.
	/// \param hasSymbolicDisplacement true if the displacement is added to a
	///                                symbol; without one, any offset that fits
	///                                in 32 bits is accepted.
	bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
	                                       bool hasSymbolicDisplacement) {
	  // Offset should fit into 32 bit immediate field.
	  if (!isInt<32>(Offset))
	    return false;

	  // If we don't have a symbolic displacement - we don't have any extra
	  // restrictions.
	  if (!hasSymbolicDisplacement)
	    return true;

	  // FIXME: Some tweaks might be needed for medium code model.
	  if (M != CodeModel::Small && M != CodeModel::Kernel)
	    return false;

	  // For small code model we assume that latest object is 16MB before end of 31
	  // bits boundary. We may also accept pretty large negative constants knowing
	  // that all objects are in the positive half of address space.
	  // The bound is spelled as a product so that it visibly matches the 16MB in
	  // the comment above.
	  if (M == CodeModel::Small && Offset < 16 * 1024 * 1024)
	    return true;

	  // For kernel code model we know that all objects reside in the negative half
	  // of 32bits address space. We may not accept negative offsets, since they may
	  // be just off and we may accept pretty large positive ones.
	  if (M == CodeModel::Kernel && Offset >= 0)
	    return true;

	  return false;
	}

	/// Decide whether the callee is responsible for popping its own arguments.
	/// Callee pop is necessary to support tail calls.
	bool X86::isCalleePop(CallingConv::ID CallingConv,
	                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
	  // When TCO must be guaranteed, non-vararg calls of the affected conventions
	  // are forced to be callee pop.
	  const bool ForcedForTCO =
	      !IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO);
	  if (ForcedForTCO)
	    return true;

	  // Otherwise only these conventions are callee pop, and only outside 64-bit
	  // mode.
	  bool IsCalleeCleanupCC = false;
	  switch (CallingConv) {
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_FastCall:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_VectorCall:
	    IsCalleeCleanupCC = true;
	    break;
	  default:
	    break;
	  }
	  return IsCalleeCleanupCC && !is64Bit;
	}

	/// Classify an integer X86 condition code: true for the equality and
	/// above/below codes, false for the greater/less codes. Any other code is
	/// invalid here.
	static bool isX86CCUnsigned(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_G:
	  case X86::COND_GE:
	  case X86::COND_L:
	  case X86::COND_LE:
	    return false;
	  case X86::COND_E:
	  case X86::COND_NE:
	  case X86::COND_B:
	  case X86::COND_A:
	  case X86::COND_BE:
	  case X86::COND_AE:
	    return true;
	  default:
	    break;
	  }
	  llvm_unreachable("Invalid integer condition!");
	}

	/// Map an integer ISD::CondCode onto the corresponding X86 condition code.
	/// Condition codes without an integer mapping are invalid here.
	static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
	  switch (SetCCOpcode) {
	  // Equality.
	  case ISD::SETEQ:
	    return X86::COND_E;
	  case ISD::SETNE:
	    return X86::COND_NE;
	  // Signed orderings.
	  case ISD::SETGT:
	    return X86::COND_G;
	  case ISD::SETGE:
	    return X86::COND_GE;
	  case ISD::SETLT:
	    return X86::COND_L;
	  case ISD::SETLE:
	    return X86::COND_LE;
	  // Unsigned orderings.
	  case ISD::SETULT:
	    return X86::COND_B;
	  case ISD::SETUGT:
	    return X86::COND_A;
	  case ISD::SETULE:
	    return X86::COND_BE;
	  case ISD::SETUGE:
	    return X86::COND_AE;
	  default:
	    break;
	  }
	  llvm_unreachable("Invalid integer condition!");
	}

	/// Translate an ISD::CondCode into the X86-specific condition code.
	///
	/// \p LHS and \p RHS are the operands of the comparison and may be updated:
	/// the integer path can replace \p RHS with a zero constant, and the floating
	/// point path can swap the two operands. Returns X86::COND_INVALID for the
	/// floating point conditions SETOEQ and SETUNE.
	static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
	                                    bool isFP, SDValue &LHS, SDValue &RHS,
	                                    SelectionDAG &DAG) {
	  if (!isFP) {
	    // A few comparisons against a constant right-hand side have cheaper
	    // forms.
	    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
	    if (RHSC && SetCCOpcode == ISD::SETGT) {
	      if (RHSC->isAllOnesValue()) {
	        // X > -1: compare against 0 instead and test for "not sign".
	        RHS = DAG.getConstant(0, DL, RHS.getValueType());
	        return X86::COND_NS;
	      }
	    } else if (RHSC && SetCCOpcode == ISD::SETLT) {
	      // X < 0: the comparison against 0 stays, test for "sign".
	      if (RHSC->isNullValue())
	        return X86::COND_S;
	      if (RHSC->getZExtValue() == 1) {
	        // X < 1 is the same as X <= 0.
	        RHS = DAG.getConstant(0, DL, RHS.getValueType());
	        return X86::COND_LE;
	      }
	    }

	    return TranslateIntegerX86CC(SetCCOpcode);
	  }

	  // Floating point. First decide whether the operands have to be, or are
	  // better, flipped.

	  // A foldable load on the left with none on the right: flip the condition
	  // together with the operands.
	  const bool LHSIsLoad = ISD::isNON_EXTLoad(LHS.getNode());
	  if (LHSIsLoad && !ISD::isNON_EXTLoad(RHS.getNode())) {
	    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
	    std::swap(LHS, RHS);
	  }

	  // These conditions are handled through their mirrored form (marked
	  // "flipped" in the table below), so only the operands are swapped.
	  switch (SetCCOpcode) {
	  case ISD::SETOLT:
	  case ISD::SETOLE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    std::swap(LHS, RHS);
	    break;
	  default:
	    break;
	  }

	  // After a floating point compare the flags encode the result as:
	  //   ZF PF CF   meaning
	  //    0  0  0   X > Y
	  //    0  0  1   X < Y
	  //    1  0  0   X == Y
	  //    1  1  1   unordered
	  switch (SetCCOpcode) {
	  case ISD::SETUEQ:
	  case ISD::SETEQ:
	    return X86::COND_E;
	  case ISD::SETOLT: // flipped
	  case ISD::SETOGT:
	  case ISD::SETGT:
	    return X86::COND_A;
	  case ISD::SETOLE: // flipped
	  case ISD::SETOGE:
	  case ISD::SETGE:
	    return X86::COND_AE;
	  case ISD::SETUGT: // flipped
	  case ISD::SETULT:
	  case ISD::SETLT:
	    return X86::COND_B;
	  case ISD::SETUGE: // flipped
	  case ISD::SETULE:
	  case ISD::SETLE:
	    return X86::COND_BE;
	  case ISD::SETONE:
	  case ISD::SETNE:
	    return X86::COND_NE;
	  case ISD::SETUO:
	    return X86::COND_P;
	  case ISD::SETO:
	    return X86::COND_NP;
	  case ISD::SETOEQ:
	  case ISD::SETUNE:
	    return X86::COND_INVALID;
	  default:
	    break;
	  }
	  llvm_unreachable("Condcode should be pre-legalized away");
	}

	/// Is there a floating point cmov for the specific X86 condition code?
	/// Current x86 isa includes the following FP cmov instructions:
	/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
	static bool hasFPCMov(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_B:
	  case X86::COND_BE:
	  case X86::COND_E:
	  case X86::COND_P:
	  case X86::COND_A:
	  case X86::COND_AE:
	  case X86::COND_NE:
	  case X86::COND_NP:
	    return true;
	  default:
	    break;
	  }
	  // No FP cmov for any other condition code.
	  return false;
	}


	/// Describe the memory access performed by a target intrinsic.
	///
	/// Fills \p Info for intrinsics that getIntrinsicWithChain() knows about and
	/// whose type is one of the expand-from-memory, compress-to-memory or
	/// truncate-to-memory forms handled below. Returns true when \p Info was
	/// filled in, false for every other intrinsic. \p MF is not used here.
	bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	const CallInst &I,
	MachineFunction &MF,
	unsigned Intrinsic) const {

	const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
	if (!IntrData)
	return false;

	// Defaults shared by all handled intrinsics; the cases below add the
	// load/store flag, the pointer operand and the memory type.
	Info.opc = ISD::INTRINSIC_W_CHAIN;
	Info.flags = MachineMemOperand::MONone;
	Info.offset = 0;

	switch (IntrData->Type) {
	case EXPAND_FROM_MEM: {
	// Load: pointer is operand 0, the memory type is the call's result type.
	Info.ptrVal = I.getArgOperand(0);
	Info.memVT = MVT::getVT(I.getType());
	Info.align = 1;
	Info.flags \|= MachineMemOperand::MOLoad;
	break;
	}
	case COMPRESS_TO_MEM: {
	// Store: pointer is operand 0, the memory type is the type of operand 1.
	Info.ptrVal = I.getArgOperand(0);
	Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
	Info.align = 1;
	Info.flags \|= MachineMemOperand::MOStore;
	break;
	}
	case TRUNCATE_TO_MEM_VI8:
	case TRUNCATE_TO_MEM_VI16:
	case TRUNCATE_TO_MEM_VI32: {
	// Truncating store: the memory type keeps the element count of operand 1
	// but uses the narrower element type selected by the intrinsic type.
	Info.ptrVal = I.getArgOperand(0);
	MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
	MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
	ScalarVT = MVT::i8;
	else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
	ScalarVT = MVT::i16;
	else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
	ScalarVT = MVT::i32;

	Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
	Info.align = 1;
	Info.flags \|= MachineMemOperand::MOStore;
	break;
	}
	default:
	// Not an intrinsic type described here.
	return false;
	}

	return true;
	}

	/// Returns true if the target can instruction select the
	/// specified FP immediate natively. If false, the legalizer will
	/// materialize the FP immediate as a load from a constant pool.
	bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
	if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
	return true;
	}
	return false;
	}

	/// Return false only for loads whose base pointer is a RIP-relative wrapper
	/// around a global address flagged MO_GOTTPOFF; every other load may be
	/// narrowed. \p ExtTy and \p NewVT are not consulted.
	bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
	                                              ISD::LoadExtType ExtTy,
	                                              EVT NewVT) const {
	  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
	  // relocation target a movq or addq instruction: don't let the load shrink.
	  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
	  if (BasePtr.getOpcode() != X86ISD::WrapperRIP)
	    return true;

	  const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0));
	  if (!GA)
	    return true;

	  return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
	}

	/// Returns true if it is beneficial to convert a load of a constant to just
	/// the constant itself. That is the case for every integer type whose size is
	/// between 1 and 64 bits; \p Imm is not consulted.
	bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                          Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  const unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  return BitSize != 0 && BitSize <= 64;
	}

	/// Allow selects of constants to be turned into math, except for vector types
	/// on AVX512 subtargets.
	bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
	  // TODO: It might be a win to ease or lift this restriction, but the generic
	  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
	  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
	  return !IsAVX512Vector;
	}

	/// Return true if extracting a \p ResVT subvector starting at element
	/// \p Index out of a \p SrcVT vector is considered cheap. EXTRACT_SUBVECTOR
	/// must be legal or custom for \p ResVT.
	bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                                unsigned Index) const {
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  // Mask vectors support all subregister combinations and operations that
	  // extract half of vector.
	  if (ResVT.getVectorElementType() == MVT::i1) {
	    if (Index == 0)
	      return true;
	    // NOTE(review): this compares the result size against twice the source
	    // size, which looks inverted for a "half of vector" extract. Kept as is;
	    // confirm the intent before changing it.
	    const bool ResIsTwiceSrc =
	        ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2;
	    return ResIsTwiceSrc && Index == ResVT.getVectorNumElements();
	  }

	  // Other vectors: cheap when Index is a multiple of the result length.
	  const unsigned ResElts = ResVT.getVectorNumElements();
	  return (Index % ResElts) == 0;
	}

	/// Speculating cttz is considered cheap only if TZCNT can be used directly,
	/// which is keyed off BMI support here.
	bool X86TargetLowering::isCheapToSpeculateCttz() const {
	  const bool CanUseTZCNT = Subtarget.hasBMI();
	  return CanUseTZCNT;
	}

	/// Speculating ctlz is considered cheap only if LZCNT can be used directly.
	bool X86TargetLowering::isCheapToSpeculateCtlz() const {
	  const bool CanUseLZCNT = Subtarget.hasLZCNT();
	  return CanUseLZCNT;
	}

	/// Reject bitcasting a load to v8i1 when the subtarget lacks DQI; otherwise
	/// defer to the generic TargetLowering answer.
	bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
	                                                EVT BitcastVT) const {
	  const bool IsV8i1WithoutDQI =
	      !Subtarget.hasDQI() && BitcastVT == MVT::v8i1;
	  if (IsV8i1WithoutDQI)
	    return false;

	  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
	}

	bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
	const SelectionDAG &DAG) const {
	// Do not merge to float value size (128 bytes) if no implicit
	// float attribute is set.
	bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat);

	if (NoFloat) {
	unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
	return (MemVT.getSizeInBits() <= MaxIntSize);
	}
	return true;
	}

	/// ctlz is reported as fast exactly when the subtarget has a fast LZCNT.
	bool X86TargetLowering::isCtlzFast() const {
	  const bool HasFastLZCNT = Subtarget.hasFastLZCNT();
	  return HasFastLZCNT;
	}

	/// Always answers true; \p AndI is not inspected.
	bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
	const Instruction &AndI) const {
	return true;
	}

	/// Return true if an and-not compare is available for the type of \p Y:
	/// the subtarget must have BMI and the type must be i32 or i64.
	bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
	  if (!Subtarget.hasBMI())
	    return false;

	  // There are only 32-bit and 64-bit forms for 'andn'.
	  const EVT VT = Y.getValueType();
	  if (VT == MVT::i32)
	    return true;
	  return VT == MVT::i64;
	}

	/// Return the type to use for a fast equality comparison of \p NumBits bits,
	/// or MVT::INVALID_SIMPLE_VALUE_TYPE if there is none. A legal integer type of
	/// that width is preferred; 128 and 256 bits fall back to byte vectors when
	/// those are legal.
	MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
	  const MVT IntVT = MVT::getIntegerVT(NumBits);
	  if (isTypeLegal(IntVT))
	    return IntVT;

	  switch (NumBits) {
	  case 128:
	    // PMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v16i8))
	      return MVT::v16i8;
	    break;
	  case 256:
	    // VPMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v32i8))
	      return MVT::v32i8;
	    break;
	  default:
	    break;
	  }

	  // TODO: Allow 64-bit type for 32-bit target.
	  // TODO: 512-bit types should be allowed, but make sure that those
	  // cases are handled in combineVectorSizedSetCCEquality().

	  return MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	/// Return true if \p Val is the undef sentinel or equals \p CmpVal.
	static bool isUndefOrEqual(int Val, int CmpVal) {
	  if (Val == SM_SentinelUndef)
	    return true;
	  return Val == CmpVal;
	}

	/// Return true if \p Val is the undef sentinel or the zero sentinel.
	static bool isUndefOrZero(int Val) {
	  if (Val == SM_SentinelUndef)
	    return true;
	  return Val == SM_SentinelZero;
	}

	/// Return true if each of the \p Size mask elements starting at position
	/// \p Pos is the undef sentinel value.
	static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
	  for (unsigned Offset = 0; Offset != Size; ++Offset) {
	    if (Mask[Pos + Offset] == SM_SentinelUndef)
	      continue;
	    return false;
	  }
	  return true;
	}

	/// Return true if \p Val is undef or lies in the half-open range
	/// [\p Low, \p Hi).
	static bool isUndefOrInRange(int Val, int Low, int Hi) {
	  if (Val == SM_SentinelUndef)
	    return true;
	  return Val >= Low && Val < Hi;
	}

	/// Return true if every element of \p Mask is undef or lies in the half-open
	/// range [\p Low, \p Hi).
	static bool isUndefOrInRange(ArrayRef<int> Mask,
	                             int Low, int Hi) {
	  for (int Elt : Mask) {
	    if (isUndefOrInRange(Elt, Low, Hi))
	      continue;
	    return false;
	  }
	  return true;
	}

	/// Return true if \p Val is the undef or zero sentinel, or lies in the
	/// half-open range [\p Low, \p Hi).
	static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
	  if (isUndefOrZero(Val))
	    return true;
	  return Val >= Low && Val < Hi;
	}

	/// Return true if every element of \p Mask is the undef or zero sentinel, or
	/// lies in the half-open range [\p Low, \p Hi).
	static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  for (int Elt : Mask) {
	    if (isUndefOrZeroOrInRange(Elt, Low, Hi))
	      continue;
	    return false;
	  }
	  return true;
	}

	/// Return true if, for each of the \p Size mask elements starting at position
	/// \p Pos, the element is undef or equals the next value of the sequence
	/// Low, Low+1, ..., Low+Size-1.
	static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
	                                       unsigned Pos, unsigned Size, int Low) {
	  int Expected = Low;
	  for (unsigned Offset = 0; Offset != Size; ++Offset, ++Expected) {
	    if (isUndefOrEqual(Mask[Pos + Offset], Expected))
	      continue;
	    return false;
	  }
	  return true;
	}

	/// Return true if, for each of the \p Size mask elements starting at position
	/// \p Pos, the element is undef, is zero, or equals the next value of the
	/// sequence Low, Low+1, ..., Low+Size-1.
	static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                             unsigned Size, int Low) {
	  int Expected = Low;
	  for (unsigned Offset = 0; Offset != Size; ++Offset, ++Expected) {
	    const int Elt = Mask[Pos + Offset];
	    if (isUndefOrZero(Elt))
	      continue;
	    if (Elt != Expected)
	      return false;
	  }
	  return true;
	}

	/// Return true if each of the \p Size mask elements starting at position
	/// \p Pos is the undef sentinel or the zero sentinel.
	static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                 unsigned Size) {
	  for (unsigned Offset = 0; Offset != Size; ++Offset) {
	    if (isUndefOrZero(Mask[Pos + Offset]))
	      continue;
	    return false;
	  }
	  return true;
	}

	/// \brief Helper function to test whether a shuffle mask could be
	/// simplified by widening the elements being shuffled.
	///
	/// On success WidenedMask is overwritten with the mask for the wider
	/// elements (one entry per pair of entries in Mask). On failure it is left
	/// in an unspecified state.
	///
	/// Mask is read two elements at a time, so it is expected to hold an even
	/// number of elements.
	///
	/// NOTE: This must handle normal vector shuffle masks and target vector
	/// shuffle masks. The latter have the special property of a '-2' representing
	/// a zero-ed lane of a vector.
	static bool canWidenShuffleElements(ArrayRef<int> Mask,
	SmallVectorImpl<int> &WidenedMask) {
	// Start from a mask of the right size; entries are filled in per pair.
	WidenedMask.assign(Mask.size() / 2, 0);
	for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
	int M0 = Mask[i];
	int M1 = Mask[i + 1];

	// If both elements are undef, its trivial.
	if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
	WidenedMask[i / 2] = SM_SentinelUndef;
	continue;
	}

	// Check for an undef mask and a mask value properly aligned to fit with
	// a pair of values. If we find such a case, use the non-undef mask's value.
	if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
	WidenedMask[i / 2] = M1 / 2;
	continue;
	}
	if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
	WidenedMask[i / 2] = M0 / 2;
	continue;
	}

	// When zeroing, we need to spread the zeroing across both lanes to widen.
	if (M0 == SM_SentinelZero \|\| M1 == SM_SentinelZero) {
	if ((M0 == SM_SentinelZero \|\| M0 == SM_SentinelUndef) &&
	(M1 == SM_SentinelZero \|\| M1 == SM_SentinelUndef)) {
	WidenedMask[i / 2] = SM_SentinelZero;
	continue;
	}
	// A zero paired with a real element cannot be widened.
	return false;
	}

	// Finally check if the two mask values are adjacent and aligned with
	// a pair.
	if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
	WidenedMask[i / 2] = M0 / 2;
	continue;
	}

	// Otherwise we can't safely widen the elements used in this shuffle.
	return false;
	}
	assert(WidenedMask.size() == Mask.size() / 2 &&
	"Incorrect size of mask after widening the elements!");

	return true;
	}

	/// Returns true if Elt is an integer constant zero or a floating point
	/// constant +0.0.
	bool X86::isZeroNode(SDValue Elt) {
	  if (isNullConstant(Elt))
	    return true;
	  return isNullFPConstant(Elt);
	}

	// Build a constant vector of type VT from the integer Values.
	// When IsMask is set, every negative entry becomes an UNDEF element.
	// If i64 is not a legal type and VT has i64 elements, each element is
	// emitted as two i32 elements (the value followed by 0, or two UNDEFs) and
	// the resulting vector is bitcast back to VT.
	static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
	                              const SDLoc &dl, bool IsMask = false) {
	  unsigned NumElts = VT.getVectorNumElements();

	  // Decide whether the i64 elements have to be split into i32 halves.
	  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  bool Split = !In64BitMode && VT.getVectorElementType() == MVT::i64;

	  MVT ConstVecVT = VT;
	  if (Split)
	    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
	  MVT EltVT = ConstVecVT.getVectorElementType();

	  SmallVector<SDValue, 32> Ops;
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    if (Values[Idx] < 0 && IsMask) {
	      // Undefined mask entry: both halves are UNDEF when splitting.
	      Ops.push_back(DAG.getUNDEF(EltVT));
	      if (Split)
	        Ops.push_back(DAG.getUNDEF(EltVT));
	      continue;
	    }

	    Ops.push_back(DAG.getConstant(Values[Idx], dl, EltVT));
	    // The second half of a split element is always zero.
	    if (Split)
	      Ops.push_back(DAG.getConstant(0, dl, EltVT));
	  }

	  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
	  return Split ? DAG.getBitcast(VT, ConstsNode) : ConstsNode;
	}

	// Build a constant vector of type VT from raw per-element bit patterns.
	// Element i is UNDEF when bit i of Undefs is set; otherwise it is built from
	// Bits[i], whose width must equal VT's scalar size. f32/f64 elements are
	// created as FP constants from the bit pattern. If i64 is not a legal type
	// and VT has i64 elements, each element is emitted as two i32 elements
	// (low 32 bits first). The result is always bitcast to VT.
	static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
	                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Bits.size() == Undefs.getBitWidth() &&
	         "Unequal constant and undef arrays");

	  unsigned NumElts = VT.getVectorNumElements();

	  // Decide whether the i64 elements have to be split into i32 halves.
	  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  bool Split = !In64BitMode && VT.getVectorElementType() == MVT::i64;

	  MVT ConstVecVT = VT;
	  if (Split)
	    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
	  MVT EltVT = ConstVecVT.getVectorElementType();

	  SmallVector<SDValue, 32> Ops;
	  for (unsigned Idx = 0, NumBits = Bits.size(); Idx != NumBits; ++Idx) {
	    if (Undefs[Idx]) {
	      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
	      continue;
	    }

	    const APInt &Val = Bits[Idx];
	    assert(Val.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");

	    if (Split) {
	      // Low half first, then high half.
	      Ops.push_back(DAG.getConstant(Val.trunc(32), dl, EltVT));
	      Ops.push_back(DAG.getConstant(Val.lshr(32).trunc(32), dl, EltVT));
	      continue;
	    }

	    if (EltVT == MVT::f32) {
	      APFloat FV(APFloat::IEEEsingle(), Val);
	      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
	      continue;
	    }

	    if (EltVT == MVT::f64) {
	      APFloat FV(APFloat::IEEEdouble(), Val);
	      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
	      continue;
	    }

	    Ops.push_back(DAG.getConstant(Val, dl, EltVT));
	  }

	  return DAG.getBitcast(VT, DAG.getBuildVector(ConstVecVT, dl, Ops));
	}

	/// Returns an all-zeros vector of type VT.
	///
	/// VT must be a 128-, 256- or 512-bit vector, or a vector of i1. Zero
	/// vectors are built as <N x i32> and bitcast to VT so that they get CSE'd;
	/// when the subtarget lacks SSE2 a 128-bit zero is built as v4f32 +0.0
	/// instead, and i1 vectors are built directly in VT.
	static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
	          VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected vector type");

	  // No SSE2: use a floating-point zero for 128-bit vectors.
	  if (!Subtarget.hasSSE2() && VT.is128BitVector())
	    return DAG.getBitcast(VT, DAG.getConstantFP(+0.0, dl, MVT::v4f32));

	  // Mask (i1) vectors are created directly in their own type.
	  if (VT.getVectorElementType() == MVT::i1) {
	    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
	           "Unexpected vector type");
	    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
	           "Unexpected vector type");
	    return DAG.getBitcast(VT, DAG.getConstant(0, dl, VT));
	  }

	  // Everything else: an i32 vector of the same total width.
	  unsigned Num32BitElts = VT.getSizeInBits() / 32;
	  MVT IntVecVT = MVT::getVectorVT(MVT::i32, Num32BitElts);
	  return DAG.getBitcast(VT, DAG.getConstant(0, dl, IntVecVT));
	}

	/// Extract the vectorWidth-bit chunk of Vec that contains element IdxVal.
	///
	/// IdxVal does not have to be chunk-aligned; it is rounded down to the first
	/// element of its chunk. A BUILD_VECTOR input is turned into a smaller
	/// BUILD_VECTOR, anything else into an EXTRACT_SUBVECTOR node.
	static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
	                                const SDLoc &dl, unsigned vectorWidth) {
	  EVT VecVT = Vec.getValueType();
	  EVT EltVT = VecVT.getVectorElementType();

	  // The result keeps the element type and has 1/Factor of the elements.
	  unsigned Factor = VecVT.getSizeInBits() / vectorWidth;
	  unsigned NumResultElts = VecVT.getVectorNumElements() / Factor;
	  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumResultElts);

	  unsigned ElemsPerChunk = vectorWidth / EltVT.getSizeInBits();
	  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	  // Round the index down to the start of its chunk. ElemsPerChunk is a
	  // power of 2, so masking off the low bits is enough.
	  unsigned ChunkMask = ElemsPerChunk - 1;
	  IdxVal = IdxVal & ~ChunkMask;

	  // A build vector can simply be rebuilt from the relevant operands.
	  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
	    auto ChunkOps = Vec->ops().slice(IdxVal, ElemsPerChunk);
	    return DAG.getBuildVector(ResultVT, dl, ChunkOps);
	  }

	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
	                     DAG.getIntPtrConstant(IdxVal, dl));
	}

	/// Generate a DAG to grab 128 bits from a 256-bit or 512-bit vector. This
	/// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128 or AVX-512
	/// VEXTRACTF32x4 / VEXTRACTI32x4 instruction, or a simple subregister
	/// reference. IdxVal is an element index inside the 128 bits we want; it
	/// need not be aligned to a 128-bit boundary, which makes lowering
	/// EXTRACT_VECTOR_ELT operations easier.
	static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  EVT VecVT = Vec.getValueType();
	  (void)VecVT; // Only used by the assert below.
	  assert((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
	         "Unexpected vector size!");
	  const unsigned ChunkBits = 128;
	  return extractSubVector(Vec, IdxVal, DAG, dl, ChunkBits);
	}

	/// Generate a DAG to grab the 256-bit half of a 512-bit vector that
	/// contains element IdxVal.
	static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  EVT VecVT = Vec.getValueType();
	  (void)VecVT; // Only used by the assert below.
	  assert(VecVT.is512BitVector() && "Unexpected vector size!");
	  const unsigned ChunkBits = 256;
	  return extractSubVector(Vec, IdxVal, DAG, dl, ChunkBits);
	}

	/// Insert Vec into Result at the vectorWidth-bit chunk that contains element
	/// IdxVal, using an INSERT_SUBVECTOR node of Result's type.
	///
	/// vectorWidth must be 128 or 256. IdxVal is rounded down to the first
	/// element of its chunk. If Vec is undef, Result is returned unchanged.
	static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                               SelectionDAG &DAG, const SDLoc &dl,
	                               unsigned vectorWidth) {
	  assert((vectorWidth == 128 || vectorWidth == 256) &&
	         "Unsupported vector width");

	  // Inserting an undef sub-vector leaves Result as it is.
	  if (Vec.isUndef())
	    return Result;

	  EVT EltVT = Vec.getValueType().getVectorElementType();
	  unsigned ElemsPerChunk = vectorWidth / EltVT.getSizeInBits();
	  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	  // Round the index down to the start of its chunk. ElemsPerChunk is a
	  // power of 2, so masking off the low bits is enough.
	  unsigned ChunkMask = ElemsPerChunk - 1;
	  IdxVal = IdxVal & ~ChunkMask;

	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, Result.getValueType(), Result,
	                     Vec, DAG.getIntPtrConstant(IdxVal, dl));
	}

	/// Generate a DAG to put a 128-bit vector into a wider vector. This sets
	/// things up to match an AVX VINSERTF128/VINSERTI128 or AVX-512
	/// VINSERTF32x4/VINSERTI32x4 instruction, or a simple superregister
	/// reference. IdxVal is an element index inside the 128 bits we want to
	/// overwrite; it need not be aligned to a 128-bit boundary, which makes
	/// lowering INSERT_VECTOR_ELT operations easier.
	static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  EVT SubVT = Vec.getValueType();
	  (void)SubVT; // Only used by the assert below.
	  assert(SubVT.is128BitVector() && "Unexpected vector size!");
	  const unsigned ChunkBits = 128;
	  return insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkBits);
	}

	/// Generate a DAG to put the 256-bit vector Vec into Result at the 256-bit
	/// chunk that contains element IdxVal.
	static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  EVT SubVT = Vec.getValueType();
	  (void)SubVT; // Only used by the assert below.
	  assert(SubVT.is256BitVector() && "Unexpected vector size!");
	  const unsigned ChunkBits = 256;
	  return insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkBits);
	}

	// Return true if Opcode is one of the mask-producing test/compare nodes
	// listed below, i.e. an instruction that zeroes the unused upper part of
	// its destination and accepts a mask.
	static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
	  switch (Opcode) {
	  case X86ISD::TESTM:
	  case X86ISD::TESTNM:
	  case X86ISD::PCMPEQM:
	  case X86ISD::PCMPGTM:
	  case X86ISD::CMPM:
	  case X86ISD::CMPMU:
	  case X86ISD::CMPM_RND:
	    return true;
	  default:
	    break;
	  }
	  return false;
	}

	/// Insert i1-subvector to i1-vector.
	///
	/// Op is expected to carry three operands: the destination vector
	/// (operand 0), the sub-vector to insert (operand 1) and the insertion
	/// index (operand 2). Returns an empty SDValue when the index is not a
	/// constant, the original vector when the sub-vector is undef, and Op
	/// itself when inserting at index 0 into an undef vector. Every other case
	/// is expanded into KSHIFTL/KSHIFTR/OR/XOR nodes on a (possibly widened)
	/// mask type and the result is extracted back to Op's type.
	static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {

	SDLoc dl(Op);
	SDValue Vec = Op.getOperand(0);
	SDValue SubVec = Op.getOperand(1);
	SDValue Idx = Op.getOperand(2);

	// Only constant insertion indices are handled here.
	if (!isa<ConstantSDNode>(Idx))
	return SDValue();

	// Inserting undef is a nop. We can just return the original vector.
	if (SubVec.isUndef())
	return Vec;

	unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
	return Op;

	MVT OpVT = Op.getSimpleValueType();
	unsigned NumElems = OpVT.getVectorNumElements();

	SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

	// Extend to natively supported kshift.
	// Vectors with fewer than 8 elements (or exactly 8 without DQI) are widened
	// to v8i1 when DQI is available and to v16i1 otherwise.
	MVT WideOpVT = OpVT;
	if ((!Subtarget.hasDQI() && NumElems == 8) \|\| NumElems < 8)
	WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

	// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
	// if necessary.
	if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
	// May need to promote to a legal type.
	Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	getZeroVector(WideOpVT, Subtarget, DAG, dl),
	SubVec, Idx);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	}

	MVT SubVecVT = SubVec.getSimpleValueType();
	unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

	assert(IdxVal + SubVecNumElems <= NumElems &&
	IdxVal % SubVecVT.getSizeInBits() == 0 &&
	"Unexpected index value in INSERT_SUBVECTOR");

	SDValue Undef = DAG.getUNDEF(WideOpVT);

	if (IdxVal == 0) {
	// Zero lower bits of the Vec
	// (shift right then left by the sub-vector length clears the low bits).
	SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
	ZeroIdx);
	Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	// Merge them together, SubVec should be zero extended.
	SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	getZeroVector(WideOpVT, Subtarget, DAG, dl),
	SubVec, ZeroIdx);
	Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	}

	// From here on IdxVal != 0. Place SubVec in the low elements of a wide
	// vector whose remaining elements are undef.
	SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	Undef, SubVec, ZeroIdx);

	// Destination is undef: just shift the sub-vector into position.
	if (Vec.isUndef()) {
	assert(IdxVal != 0 && "Unexpected index");
	SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	DAG.getConstant(IdxVal, dl, MVT::i8));
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	}

	// Destination is all zeros: shift the sub-vector to the top of the wide
	// vector (dropping the undef elements above it), then shift it back down
	// to IdxVal so that zeros are shifted in on both sides.
	if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	assert(IdxVal != 0 && "Unexpected index");
	// NOTE: NumElems now refers to the widened type.
	NumElems = WideOpVT.getVectorNumElements();
	unsigned ShiftLeft = NumElems - SubVecNumElems;
	unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	DAG.getConstant(ShiftLeft, dl, MVT::i8));
	if (ShiftRight != 0)
	SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
	DAG.getConstant(ShiftRight, dl, MVT::i8));
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	}

	// Simple case when we put subvector in the upper part
	// (NumElems is still the element count of the original type here).
	if (IdxVal + SubVecNumElems == NumElems) {
	SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	DAG.getConstant(IdxVal, dl, MVT::i8));
	if (SubVecNumElems * 2 == NumElems) {
	// Special case, use legal zero extending insert_subvector. This allows
	// isel to optimize when bits are known zero.
	Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	getZeroVector(WideOpVT, Subtarget, DAG, dl),
	Vec, ZeroIdx);
	} else {
	// Otherwise use explicit shifts to zero the bits.
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	Undef, Vec, ZeroIdx);
	// NOTE: NumElems now refers to the widened type.
	NumElems = WideOpVT.getVectorNumElements();
	SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
	Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	}
	Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	}

	// Inserting into the middle is more complicated.

	// NOTE: NumElems now refers to the widened type.
	NumElems = WideOpVT.getVectorNumElements();

	// Widen the vector if needed.
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
	// Move the current value of the bits to be replaced to the lsbs.
	Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
	DAG.getConstant(IdxVal, dl, MVT::i8));
	// Xor with the new bit.
	Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
	// Shift to MSB, filling bottom bits with 0.
	unsigned ShiftLeft = NumElems - SubVecNumElems;
	Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
	DAG.getConstant(ShiftLeft, dl, MVT::i8));
	// Shift to the final position, filling upper bits with 0.
	unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
	DAG.getConstant(ShiftRight, dl, MVT::i8));
	// Xor with original vector leaving the new value.
	Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
	// Reduce to original width if needed.
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	}

	/// Concatenate two 128-bit vectors into a vector of type VT by inserting V1
	/// at element 0 of an undef vector and V2 at element NumElems / 2. This is
	/// used because creating CONCAT_VECTOR nodes of BUILD_VECTORS returns a
	/// larger BUILD_VECTOR while we're trying to lower large BUILD_VECTORS.
	static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
	                                   unsigned NumElems, SelectionDAG &DAG,
	                                   const SDLoc &dl) {
	  // Low half goes into an otherwise undefined vector.
	  SDValue LowInserted = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
	  // High half starts at the middle element.
	  unsigned HighIdx = NumElems / 2;
	  return insert128BitVector(LowInserted, V2, HighIdx, DAG, dl);
	}

	/// Concatenate two 256-bit vectors into a vector of type VT by inserting V1
	/// at element 0 of an undef vector and V2 at element NumElems / 2.
	static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
	                                   unsigned NumElems, SelectionDAG &DAG,
	                                   const SDLoc &dl) {
	  // Low half goes into an otherwise undefined vector.
	  SDValue LowInserted = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
	  // High half starts at the middle element.
	  unsigned HighIdx = NumElems / 2;
	  return insert256BitVector(LowInserted, V2, HighIdx, DAG, dl);
	}

	/// Returns a vector of type VT with every bit set.
	/// The constant is always built as an i32 vector of the same total width
	/// (<4 x i32>, <8 x i32> or <16 x i32>) and then bitcast to VT, ensuring
	/// such constants get CSE'd.
	static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Expected a 128/256/512-bit vector type");

	  unsigned NumElts = VT.getSizeInBits() / 32;
	  MVT IntVecVT = MVT::getVectorVT(MVT::i32, NumElts);
	  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(32), dl, IntVecVT);
	  return DAG.getBitcast(VT, AllOnes);
	}

	/// Build a sign (X86ISD::VSEXT) or zero (X86ISD::VZEXT) extension of the low
	/// elements of In to the vector type VT.
	///
	/// When both types are 128 bits wide the generic *_EXTEND_VECTOR_INREG
	/// helpers are used. When both are wider than 128 bits, In is first narrowed
	/// to the low sub-vector that actually feeds the extension (at least 128
	/// bits) before the target node is created.
	static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
	                              SelectionDAG &DAG) {
	  EVT InVT = In.getValueType();
	  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

	  if (VT.is128BitVector() && InVT.is128BitVector()) {
	    if (X86ISD::VSEXT == Opc)
	      return DAG.getSignExtendVectorInReg(In, DL, VT);
	    return DAG.getZeroExtendVectorInReg(In, DL, VT);
	  }

	  // For 256-bit vectors, we only need the lower (128-bit) input half.
	  // For 512-bit vectors, we only need the lower input half or quarter.
	  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
	    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
	    int NeededBits = static_cast<int>(VT.getSizeInBits()) / Scale;
	    In = extractSubVector(In, 0, DAG, DL, std::max(128, NeededBits));
	  }

	  return DAG.getNode(Opc, DL, VT, In);
	}

	/// Returns a vector_shuffle node for an unpackl operation.
	/// The mask is produced by createUnpackShuffleMask with Lo = true and
	/// Unary = false, and is applied to the two inputs V1 and V2.
	static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  // Each argument carries its own closed comment so that neither value is
	  // swallowed by the annotation.
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Returns a vector_shuffle node for an unpackh operation.
	/// The mask is produced by createUnpackShuffleMask with Lo = false and
	/// Unary = false, and is applied to the two inputs V1 and V2.
	static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  // Each argument carries its own closed comment so that neither value is
	  // swallowed by the annotation.
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Return a vector_shuffle that places the low element of V2 at position Idx
	/// of a vector that is otherwise all zeros (IsZero) or undef (!IsZero).
	/// The first shuffle input is the zero/undef vector and the second is V2, so
	/// the mask is the identity except for entry Idx, which selects element 0 of
	/// V2: e.g. 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3) for four elements.
	static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
	                                           bool IsZero,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  MVT VT = V2.getSimpleValueType();

	  SDValue V1;
	  if (IsZero)
	    V1 = getZeroVector(VT, Subtarget, DAG, SDLoc(V2));
	  else
	    V1 = DAG.getUNDEF(VT);

	  int NumElems = VT.getVectorNumElements();
	  SmallVector<int, 16> MaskVec(NumElems);
	  for (int Elt = 0; Elt != NumElems; ++Elt) {
	    if (Elt == Idx)
	      MaskVec[Elt] = NumElems; // Low element of V2 lands here.
	    else
	      MaskVec[Elt] = Elt;      // Keep the zero/undef element.
	  }
	  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
	}

	/// Strip any chain of BITCAST nodes from V and return the first value that
	/// is not a bitcast. A null value is returned unchanged.
	static SDValue peekThroughBitcasts(SDValue V) {
	  for (;;) {
	    if (!V.getNode() || V.getOpcode() != ISD::BITCAST)
	      return V;
	    V = V.getOperand(0);
	  }
	}

	/// Strip BITCAST nodes from V for as long as the bitcast's source operand
	/// has a single use; return the value where stripping stopped. A null value
	/// is returned unchanged.
	static SDValue peekThroughOneUseBitcasts(SDValue V) {
	  for (;;) {
	    if (!V.getNode() || V.getOpcode() != ISD::BITCAST)
	      return V;
	    // Do not look through a bitcast whose source is used elsewhere too.
	    if (!V.getOperand(0).hasOneUse())
	      return V;
	    V = V.getOperand(0);
	  }
	}

	/// If Op (after looking through bitcasts) is a load whose base pointer is a
	/// constant pool entry, optionally wrapped in X86ISD::Wrapper or
	/// X86ISD::WrapperRIP, return the IR constant stored in that entry.
	/// Returns nullptr for anything else, including machine constant pool
	/// entries.
	static const Constant *getTargetConstantFromNode(SDValue Op) {
	  Op = peekThroughBitcasts(Op);

	  auto *Load = dyn_cast<LoadSDNode>(Op);
	  if (!Load)
	    return nullptr;

	  // Look through the X86 address wrapper, if any.
	  SDValue Ptr = Load->getBasePtr();
	  unsigned PtrOpc = Ptr->getOpcode();
	  if (PtrOpc == X86ISD::Wrapper || PtrOpc == X86ISD::WrapperRIP)
	    Ptr = Ptr->getOperand(0);

	  if (auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr))
	    if (!CNode->isMachineConstantPoolEntry())
	      return dyn_cast<Constant>(CNode->getConstVal());

	  return nullptr;
	}

	// Extract the raw constant bits of Op, re-sliced into elements of
	// EltSizeInBits bits each.
	//
	// Op is inspected after looking through bitcasts. Recognized sources are:
	// UNDEF, a scalar integer constant, a BUILD_VECTOR of integer constants, a
	// constant pool vector load, an X86ISD::VBROADCAST of a constant pool
	// scalar, and an X86ISD::VZEXT_MOVL of a SCALAR_TO_VECTOR of a constant.
	//
	// On success returns true, with EltBits holding one APInt of EltSizeInBits
	// bits per element and bit i of UndefElts set when element i is entirely
	// undef. EltBits must be empty on entry. Returns false if Op is not one of
	// the recognized forms, or if it contains undef data that the flags forbid:
	//  - AllowWholeUndefs: accept elements whose bits are all undef.
	//  - AllowPartialUndefs: accept elements with only some undef bits (those
	//    bits are then treated as zero).
	static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
	                                          APInt &UndefElts,
	                                          SmallVectorImpl<APInt> &EltBits,
	                                          bool AllowWholeUndefs = true,
	                                          bool AllowPartialUndefs = true) {
	  assert(EltBits.empty() && "Expected an empty EltBits vector");

	  Op = peekThroughBitcasts(Op);

	  EVT VT = Op.getValueType();
	  unsigned SizeInBits = VT.getSizeInBits();
	  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
	  unsigned NumElts = SizeInBits / EltSizeInBits;

	  // Bitcast a source array of element bits to the target size.
	  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
	    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
	    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
	    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
	           "Constant bit sizes don't match");

	    // Don't split if we don't allow undef bits.
	    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
	    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
	      return false;

	    // If we're already the right size, don't bother bitcasting.
	    if (NumSrcElts == NumElts) {
	      UndefElts = UndefSrcElts;
	      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
	      return true;
	    }

	    // Extract all the undef/constant element data and pack into single bitsets.
	    APInt UndefBits(SizeInBits, 0);
	    APInt MaskBits(SizeInBits, 0);

	    for (unsigned i = 0; i != NumSrcElts; ++i) {
	      unsigned BitOffset = i * SrcEltSizeInBits;
	      if (UndefSrcElts[i])
	        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
	      MaskBits.insertBits(SrcEltBits[i], BitOffset);
	    }

	    // Split the undef/constant single bitset data into the target elements.
	    UndefElts = APInt(NumElts, 0);
	    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

	    for (unsigned i = 0; i != NumElts; ++i) {
	      unsigned BitOffset = i * EltSizeInBits;
	      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

	      // Only treat an element as UNDEF if all bits are UNDEF.
	      if (UndefEltBits.isAllOnesValue()) {
	        if (!AllowWholeUndefs)
	          return false;
	        UndefElts.setBit(i);
	        continue;
	      }

	      // If only some bits are UNDEF then treat them as zero (or bail if not
	      // supported).
	      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
	        return false;

	      // Store the extracted bits directly. Going through getZExtValue()
	      // would assert for elements with more than 64 active bits.
	      EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
	    }
	    return true;
	  };

	  // Collect constant bits and insert into mask/undef bit masks.
	  // Returns false for a null constant or one that is neither undef, an
	  // integer nor a floating point constant.
	  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
	                                unsigned UndefBitIndex) {
	    if (!Cst)
	      return false;
	    if (isa<UndefValue>(Cst)) {
	      Undefs.setBit(UndefBitIndex);
	      return true;
	    }
	    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
	      Mask = CInt->getValue();
	      return true;
	    }
	    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
	      Mask = CFP->getValueAPF().bitcastToAPInt();
	      return true;
	    }
	    return false;
	  };

	  // Handle UNDEFs.
	  if (Op.isUndef()) {
	    APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
	    SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract scalar constant bits.
	  if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
	    APInt UndefSrcElts = APInt::getNullValue(1);
	    SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from build vector.
	  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	      const SDValue &Src = Op.getOperand(i);
	      if (Src.isUndef()) {
	        UndefSrcElts.setBit(i);
	        continue;
	      }
	      auto *Cst = cast<ConstantSDNode>(Src);
	      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
	    }
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from constant pool vector.
	  if (auto *Cst = getTargetConstantFromNode(Op)) {
	    Type *CstTy = Cst->getType();
	    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
	      return false;

	    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
	    unsigned NumSrcElts = CstTy->getVectorNumElements();

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0; i != NumSrcElts; ++i)
	      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
	                               UndefSrcElts, i))
	        return false;

	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from a broadcasted constant pool scalar.
	  if (Op.getOpcode() == X86ISD::VBROADCAST &&
	      EltSizeInBits <= VT.getScalarSizeInBits()) {
	    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
	      unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
	      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	      APInt UndefSrcElts(NumSrcElts, 0);
	      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
	      if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
	        // An undef scalar makes every broadcast element undef.
	        if (UndefSrcElts[0])
	          UndefSrcElts.setBits(0, NumSrcElts);
	        // Replicate the scalar into the remaining elements.
	        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
	        return CastBitData(UndefSrcElts, SrcEltBits);
	      }
	    }
	  }

	  // Extract a rematerialized scalar constant insertion.
	  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
	      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
	      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    // Element 0 holds the constant; all remaining elements are zero.
	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits;
	    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
	    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
	    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  return false;
	}

	/// Decode the constant MaskNode into raw shuffle indices of
	/// MaskEltSizeInBits bits each and append them to RawMask.
	/// Returns false (leaving RawMask untouched) if the constant bits cannot be
	/// extracted or contain any undef data.
	static bool getTargetShuffleMaskIndices(SDValue MaskNode,
	                                        unsigned MaskEltSizeInBits,
	                                        SmallVectorImpl<uint64_t> &RawMask) {
	  APInt UndefElts;
	  SmallVector<APInt, 64> EltBits;

	  // Extract the raw target constant bits.
	  // FIXME: We currently don't support UNDEF bits or mask entries.
	  bool Extracted = getTargetConstantBitsFromNode(
	      MaskNode, MaskEltSizeInBits, UndefElts, EltBits,
	      /* AllowWholeUndefs */ false, /* AllowPartialUndefs */ false);
	  if (!Extracted)
	    return false;

	  // Insert the extracted elements into the mask.
	  for (unsigned Idx = 0, NumElts = EltBits.size(); Idx != NumElts; ++Idx)
	    RawMask.push_back(EltBits[Idx].getZExtValue());

	  return true;
	}

	/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
	/// For every 128-bit lane the mask selects the even-numbered elements of
	/// that lane from the first input, followed by the even-numbered elements of
	/// the same lane from the second input (or from the first input again when
	/// Unary is set). Mask must be empty on entry.
	/// Note: This ignores saturation, so inputs must be checked first.
	static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
	                                  bool Unary) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumLanes = VT.getSizeInBits() / 128;
	  unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
	  // Indices into the second input start at NumElts.
	  unsigned Offset = Unary ? 0 : NumElts;

	  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	    unsigned LaneBase = Lane * NumEltsPerLane;
	    // First input's half of the lane.
	    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
	      Mask.push_back(LaneBase + Elt);
	    // Second input's half of the lane.
	    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
	      Mask.push_back(LaneBase + Elt + Offset);
	  }
	}

	/// Calculates the shuffle mask corresponding to the target-specific opcode.
	/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
	/// operands in \p Ops, and returns true.
	/// Sets \p IsUnary to true if only one source is used. Note that this will set
	/// IsUnary for shuffles which use a single input multiple times, and in those
	/// cases it will adjust the mask to only have indices within that single input.
	/// It is an error to call this with non-empty Mask/Ops vectors.
	static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
	SmallVectorImpl<SDValue> &Ops,
	SmallVectorImpl<int> &Mask, bool &IsUnary) {
	unsigned NumElems = VT.getVectorNumElements();
	SDValue ImmN;

	assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
	assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

	IsUnary = false;
	bool IsFakeUnary = false;
	switch(N->getOpcode()) {
	case X86ISD::BLENDI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::SHUFP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::INSERTPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::EXTRQI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	if (isa<ConstantSDNode>(N->getOperand(1)) &&
	isa<ConstantSDNode>(N->getOperand(2))) {
	int BitLen = N->getConstantOperandVal(1);
	int BitIdx = N->getConstantOperandVal(2);
	DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
	IsUnary = true;
	}
	break;
	case X86ISD::INSERTQI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	if (isa<ConstantSDNode>(N->getOperand(2)) &&
	isa<ConstantSDNode>(N->getOperand(3))) {
	int BitLen = N->getConstantOperandVal(2);
	int BitIdx = N->getConstantOperandVal(3);
	DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	}
	break;
	case X86ISD::UNPCKH:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeUNPCKHMask(VT, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::UNPCKL:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeUNPCKLMask(VT, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVHLPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeMOVHLPSMask(NumElems, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVLHPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeMOVLHPSMask(NumElems, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::PALIGNR:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	Ops.push_back(N->getOperand(1));
	Ops.push_back(N->getOperand(0));
	break;
	case X86ISD::VSHLDQ:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = true;
	break;
	case X86ISD::VSRLDQ:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFD:
	case X86ISD::VPERMILPI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFHW:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFLW:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = true;
	break;
	case X86ISD::VZEXT_MOVL:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeZeroMoveLowMask(VT, Mask);
	IsUnary = true;
	break;
	case X86ISD::VBROADCAST: {
	SDValue N0 = N->getOperand(0);
	// See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
	// add the pre-extracted value to the Ops vector.
	if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N0.getOperand(0).getValueType() == VT &&
	N0.getConstantOperandVal(1) == 0)
	Ops.push_back(N0.getOperand(0));

	// We only decode broadcasts of same-sized vectors, unless the broadcast
	// came from an extract from the original width. If we found one, we
	// pushed it to the Ops vector above.
	if (N0.getValueType() == VT \|\| !Ops.empty()) {
	DecodeVectorBroadcast(VT, Mask);
	IsUnary = true;
	break;
	}
	return false;
	}
	case X86ISD::VPERMILPV: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	SDValue MaskNode = N->getOperand(1);
	unsigned MaskEltSize = VT.getScalarSizeInBits();
	SmallVector<uint64_t, 32> RawMask;
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	DecodeVPERMILPMask(VT, RawMask, Mask);
	break;
	}
	if (auto *C = getTargetConstantFromNode(MaskNode)) {
	DecodeVPERMILPMask(C, MaskEltSize, Mask);
	break;
	}
	return false;
	}
	case X86ISD::PSHUFB: {
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	SDValue MaskNode = N->getOperand(1);
	SmallVector<uint64_t, 32> RawMask;
	if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
	DecodePSHUFBMask(RawMask, Mask);
	break;
	}
	if (auto *C = getTargetConstantFromNode(MaskNode)) {
	DecodePSHUFBMask(C, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
	break;
	case X86ISD::VPERM2X128:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands()-1);
	DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVSLDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVSLDUPMask(VT, Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVSHDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVSHDUPMask(VT, Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVDDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVDDUPMask(VT, Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVLPD:
	case X86ISD::MOVLPS:
	// Not yet implemented
	return false;
	case X86ISD::VPERMIL2: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	unsigned MaskEltSize = VT.getScalarSizeInBits();
	SDValue MaskNode = N->getOperand(2);
	SDValue CtrlNode = N->getOperand(3);
	if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
	unsigned CtrlImm = CtrlOp->getZExtValue();
	SmallVector<uint64_t, 32> RawMask;
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
	break;
	}
	if (auto *C = getTargetConstantFromNode(MaskNode)) {
	DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
	break;
	}
	}
	return false;
	}
	case X86ISD::VPPERM: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	SDValue MaskNode = N->getOperand(2);
	SmallVector<uint64_t, 32> RawMask;
	if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
	DecodeVPPERMMask(RawMask, Mask);
	break;
	}
	if (auto *C = getTargetConstantFromNode(MaskNode)) {
	DecodeVPPERMMask(C, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMV: {
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
	Ops.push_back(N->getOperand(1));
	SDValue MaskNode = N->getOperand(0);
	SmallVector<uint64_t, 32> RawMask;
	unsigned MaskEltSize = VT.getScalarSizeInBits();
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	DecodeVPERMVMask(RawMask, Mask);
	break;
	}
	if (auto *C = getTargetConstantFromNode(MaskNode)) {
	DecodeVPERMVMask(C, MaskEltSize, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMV3: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
	// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
	Ops.push_back(N->getOperand(0));
	Ops.push_back(N->getOperand(2));
	SDValue MaskNode = N->getOperand(1);
	unsigned MaskEltSize = VT.getScalarSizeInBits();
	if (auto *C = getTargetConstantFromNode(MaskNode)) {
	DecodeVPERMV3Mask(C, MaskEltSize, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMIV3: {
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
	// Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
	Ops.push_back(N->getOperand(1));
	Ops.push_back(N->getOperand(2));
	SDValue MaskNode = N->getOperand(0);
	unsigned MaskEltSize = VT.getScalarSizeInBits();
	if (auto *C = getTargetConstantFromNode(MaskNode)) {
	DecodeVPERMV3Mask(C, MaskEltSize, Mask);
	break;
	}
	return false;
	}
	default: llvm_unreachable("unknown target shuffle node");
	}

	// Empty mask indicates the decode failed.
	if (Mask.empty())
	return false;

	// Check if we're getting a shuffle mask with zero'd elements.
	if (!AllowSentinelZero)
	if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
	return false;

	// If we have a fake unary shuffle, the shuffle mask is spread across two
	// inputs that are actually the same node. Re-map the mask to always point
	// into the first input.
	if (IsFakeUnary)
	for (int &M : Mask)
	if (M >= (int)Mask.size())
	M -= Mask.size();

	// If we didn't already add operands in the opcode-specific code, default to
	// adding 1 or 2 operands starting at 0.
	if (Ops.empty()) {
	Ops.push_back(N->getOperand(0));
	if (!IsUnary \|\| IsFakeUnary)
	Ops.push_back(N->getOperand(1));
	}

	return true;
	}

	/// Decode the target shuffle \p N into \p Mask / \p Ops and then refine the
	/// mask using what is known about the shuffle inputs: elements that are
	/// known to be zero (not just zeroable) become SM_SentinelZero and elements
	/// that read undefined data become SM_SentinelUndef.
	/// Returns true if the target shuffle mask was decoded.
	static bool setTargetShuffleZeroElements(SDValue N,
	                                         SmallVectorImpl<int> &Mask,
	                                         SmallVectorImpl<SDValue> &Ops) {
	  if (!isTargetShuffle(N.getOpcode()))
	    return false;

	  bool IsUnary;
	  MVT VT = N.getSimpleValueType();
	  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
	    return false;

	  // A unary shuffle reads both mask ranges from the same operand.
	  SDValue Src0 = peekThroughBitcasts(Ops[0]);
	  SDValue Src1 = peekThroughBitcasts(IsUnary ? Ops[0] : Ops[1]);

	  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
	         "Illegal split of shuffle value type");
	  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

	  // Gather whatever constant data each input provides, at mask granularity.
	  APInt UndefSrcElts[2];
	  SmallVector<APInt, 32> SrcEltBits[2];
	  bool IsSrcConstant[2];
	  IsSrcConstant[0] = getTargetConstantBitsFromNode(
	      Src0, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], true, false);
	  IsSrcConstant[1] = getTargetConstantBitsFromNode(
	      Src1, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false);

	  const int NumMaskElts = Mask.size();
	  for (int Pos = 0; Pos < NumMaskElts; ++Pos) {
	    const int RawIdx = Mask[Pos];

	    // Sentinel values (SM_SentinelZero / SM_SentinelUndef) are final.
	    if (RawIdx < 0)
	      continue;

	    // Split the raw index into (input, element within that input).
	    const unsigned SrcIdx = RawIdx / NumMaskElts;
	    SDValue Src = RawIdx < NumMaskElts ? Src0 : Src1;
	    const int EltIdx = RawIdx % NumMaskElts;

	    // Anything read from an UNDEF input is itself undefined.
	    if (Src.isUndef()) {
	      Mask[Pos] = SM_SentinelUndef;
	      continue;
	    }

	    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
	    // TODO: We currently only set UNDEF for integer types - floats use the same
	    // registers as vectors and many of the scalar folded loads rely on the
	    // SCALAR_TO_VECTOR pattern.
	    if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	      const unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	      if ((NumMaskElts % NumSrcElts) == 0) {
	        const int Scale = NumMaskElts / NumSrcElts;
	        const int SrcElt = EltIdx / Scale;
	        if (SrcElt != 0) {
	          if (!VT.isFloatingPoint())
	            Mask[Pos] = SM_SentinelUndef;
	        } else if (X86::isZeroNode(Src.getOperand(0))) {
	          Mask[Pos] = SM_SentinelZero;
	        }
	        continue;
	      }
	    }

	    // Otherwise fall back on the constant bits collected for this input.
	    if (!IsSrcConstant[SrcIdx])
	      continue;
	    if (UndefSrcElts[SrcIdx][EltIdx])
	      Mask[Pos] = SM_SentinelUndef;
	    else if (SrcEltBits[SrcIdx][EltIdx] == 0)
	      Mask[Pos] = SM_SentinelZero;
	  }

	  assert(VT.getVectorNumElements() == Mask.size() &&
	         "Different mask size from vector size!");
	  return true;
	}

	// Attempt to decode ops that could be represented as a shuffle mask.
	// The decoded shuffle mask may contain a different number of elements to the
	// destination value type.
	//
	// Mask and Ops are cleared on entry. Returns true when N's opcode is one of
	// the handled cases and the pattern matched; Mask then holds the equivalent
	// shuffle indices (or SM_SentinelZero / SM_SentinelUndef) and Ops the inputs
	// those indices refer to. Returns false otherwise; in that case Mask/Ops may
	// have been partially filled and must not be used.
	static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
	SmallVectorImpl<SDValue> &Ops,
	SelectionDAG &DAG) {
	Mask.clear();
	Ops.clear();

	MVT VT = N.getSimpleValueType();
	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumSizeInBits = VT.getSizeInBits();
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
	"Expected byte aligned value types");

	unsigned Opcode = N.getOpcode();
	switch (Opcode) {
	case ISD::AND:
	case X86ISD::ANDNP: {
	// Attempt to decode as a per-byte mask.
	// The constant operand is operand 1 for AND and operand 0 for ANDNP. Every
	// defined constant byte must be 0x00 or 0xFF: a byte equal to ZeroMask
	// clears the result byte, any other keeps byte i of the remaining operand.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	bool IsAndN = (X86ISD::ANDNP == Opcode);
	uint64_t ZeroMask = IsAndN ? 255 : 0;
	if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
	return false;
	for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
	if (UndefElts[i]) {
	Mask.push_back(SM_SentinelUndef);
	continue;
	}
	uint64_t ByteBits = EltBits[i].getZExtValue();
	// Partial byte masks cannot be expressed as a shuffle.
	if (ByteBits != 0 && ByteBits != 255)
	return false;
	Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
	}
	Ops.push_back(IsAndN ? N1 : N0);
	return true;
	}
	case ISD::SCALAR_TO_VECTOR: {
	// Match against a scalar_to_vector of an extract from a vector,
	// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
	SDValue N0 = N.getOperand(0);
	SDValue SrcExtract;

	if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	N0.getOperand(0).getValueType() == VT) \|\|
	(N0.getOpcode() == X86ISD::PEXTRW &&
	N0.getOperand(0).getValueType() == MVT::v8i16) \|\|
	(N0.getOpcode() == X86ISD::PEXTRB &&
	N0.getOperand(0).getValueType() == MVT::v16i8)) {
	SrcExtract = N0;
	}

	// Only extracts with a constant index can be turned into a mask entry.
	if (!SrcExtract \|\| !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
	return false;

	SDValue SrcVec = SrcExtract.getOperand(0);
	EVT SrcVT = SrcVec.getValueType();
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	// Number of source-sized elements of the result scalar that follow the
	// extracted element; these are emitted as SM_SentinelZero below.
	unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;

	unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
	if (NumSrcElts <= SrcIdx)
	return false;

	// Mask layout (in source elements): the extracted element, NumZeros zero
	// elements, then undef up to NumSrcElts entries.
	Ops.push_back(SrcVec);
	Mask.push_back(SrcIdx);
	Mask.append(NumZeros, SM_SentinelZero);
	Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
	return true;
	}
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: {
	SDValue InVec = N.getOperand(0);
	SDValue InScl = N.getOperand(1);
	uint64_t InIdx = N.getConstantOperandVal(2);
	assert(InIdx < NumElts && "Illegal insertion index");

	// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
	// This is an identity shuffle of InVec with element InIdx zeroed.
	if (X86::isZeroNode(InScl)) {
	Ops.push_back(InVec);
	for (unsigned i = 0; i != NumElts; ++i)
	Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
	return true;
	}

	// Attempt to recognise a PINSR(PEXTR) shuffle pattern.
	// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
	unsigned ExOp =
	(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
	if (InScl.getOpcode() != ExOp)
	return false;

	// Two-input shuffle: identity on InVec except that element InIdx is taken
	// from element ExIdx of ExVec (indices NumElts and above select ExVec).
	SDValue ExVec = InScl.getOperand(0);
	uint64_t ExIdx = InScl.getConstantOperandVal(1);
	assert(ExIdx < NumElts && "Illegal extraction index");
	Ops.push_back(InVec);
	Ops.push_back(ExVec);
	for (unsigned i = 0; i != NumElts; ++i)
	Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
	return true;
	}
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: {
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
	N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
	"Unexpected input value type");

	// If we know input saturation won't happen we can treat this
	// as a truncation shuffle.
	if (Opcode == X86ISD::PACKSS) {
	// Signed pack: each (non-undef) input needs more than NumBitsPerElt sign
	// bits, otherwise we give up.
	if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) \|\|
	(!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
	return false;
	} else {
	// Unsigned pack: the upper NumBitsPerElt bits of each (non-undef) input
	// element must be known zero.
	APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
	if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) \|\|
	(!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
	return false;
	}

	bool IsUnary = (N0 == N1);

	// Only list the input once when both operands are the same node.
	Ops.push_back(N0);
	if (!IsUnary)
	Ops.push_back(N1);

	createPackShuffleMask(VT, Mask, IsUnary);
	return true;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	uint64_t ShiftVal = N.getConstantOperandVal(1);
	// Out of range bit shifts are guaranteed to be zero.
	// Note that no input is added to Ops in this case.
	if (NumBitsPerElt <= ShiftVal) {
	Mask.append(NumElts, SM_SentinelZero);
	return true;
	}

	// We can only decode 'whole byte' bit shifts as shuffles.
	if ((ShiftVal % 8) != 0)
	break;

	// The mask built below is a byte-level mask (NumBytes entries).
	uint64_t ByteShift = ShiftVal / 8;
	unsigned NumBytes = NumSizeInBits / 8;
	unsigned NumBytesPerElt = NumBitsPerElt / 8;
	Ops.push_back(N.getOperand(0));

	// Clear mask to all zeros and insert the shifted byte indices.
	Mask.append(NumBytes, SM_SentinelZero);

	if (X86ISD::VSHLI == Opcode) {
	// Left shift: within each element, byte j takes source byte j - ByteShift.
	for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	Mask[i + j] = i + j - ByteShift;
	} else {
	// Right shift: within each element, byte j - ByteShift takes source byte j.
	for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	Mask[i + j - ByteShift] = i + j;
	}
	return true;
	}
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case X86ISD::VZEXT: {
	// TODO - add support for VPMOVZX with smaller input vector types.
	SDValue Src = N.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	// Only handled when source and result vectors have the same total size.
	if (NumSizeInBits != SrcVT.getSizeInBits())
	break;
	DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
	Ops.push_back(Src);
	return true;
	}
	}

	// Unhandled opcode, or a case above broke out of the switch.
	return false;
	}

	/// Drops shuffle inputs that no mask element refers to and renumbers the
	/// remaining mask indices so that they address the compacted input list.
	/// Mask elements that refer to an UNDEF input are replaced by
	/// SM_SentinelUndef first, so such inputs are dropped as well.
	static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
	                                              SmallVectorImpl<int> &Mask) {
	  const int Width = Mask.size();
	  SmallVector<SDValue, 16> Kept;

	  for (const SDValue &Input : Inputs) {
	    // Index range the current input occupies, given the inputs kept so far.
	    const int Begin = Kept.size() * Width;
	    const int End = Begin + Width;
	    auto RefersToInput = [Begin, End](int Idx) {
	      return (Begin <= Idx) && (Idx < End);
	    };

	    // Strip UNDEF input usage.
	    if (Input.isUndef()) {
	      for (int &Idx : Mask)
	        if (RefersToInput(Idx))
	          Idx = SM_SentinelUndef;
	    }

	    // An unused input is removed: shift every later index down by one
	    // input's worth of elements.
	    if (!any_of(Mask, RefersToInput)) {
	      for (int &Idx : Mask)
	        if (Begin <= Idx)
	          Idx -= Width;
	      continue;
	    }

	    Kept.push_back(Input);
	  }

	  Inputs = Kept;
	}

	/// Decodes \p Op as a shuffle, first via setTargetShuffleZeroElements (which
	/// also sets the SM_SentinelUndef and SM_SentinelZero values) and, if that
	/// fails, via getFauxShuffleMask. The decoded inputs are then pruned with
	/// resolveTargetShuffleInputsAndMask in case some of them are unused.
	/// Returns true if the target shuffle mask was decoded.
	static bool resolveTargetShuffleInputs(SDValue Op,
	                                       SmallVectorImpl<SDValue> &Inputs,
	                                       SmallVectorImpl<int> &Mask,
	                                       SelectionDAG &DAG) {
	  bool Decoded = setTargetShuffleZeroElements(Op, Mask, Inputs);
	  if (!Decoded)
	    Decoded = getFauxShuffleMask(Op, Mask, Inputs, DAG);
	  if (!Decoded)
	    return false;

	  resolveTargetShuffleInputsAndMask(Inputs, Mask);
	  return true;
	}

	/// Follows element \p Index of the vector produced by \p N back through
	/// shuffles (generic and target specific), a same-element-count bitcast,
	/// SCALAR_TO_VECTOR and BUILD_VECTOR to find the scalar that ends up in that
	/// lane. Returns a null SDValue when the scalar cannot be identified or the
	/// recursion depth limit is reached.
	static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
	                                   unsigned Depth) {
	  // Limit search depth.
	  if (Depth == 6)
	    return SDValue();

	  SDValue V = SDValue(N, 0);
	  EVT VT = V.getValueType();
	  unsigned Opcode = V.getOpcode();

	  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
	  if (const ShuffleVectorSDNode *Shuf = dyn_cast<ShuffleVectorSDNode>(N)) {
	    const int SrcElt = Shuf->getMaskElt(Index);
	    if (SrcElt < 0)
	      return DAG.getUNDEF(VT.getVectorElementType());

	    const unsigned NumElems = VT.getVectorNumElements();
	    SDValue Src =
	        (SrcElt < (int)NumElems) ? Shuf->getOperand(0) : Shuf->getOperand(1);
	    return getShuffleScalarElt(Src.getNode(), SrcElt % NumElems, DAG,
	                               Depth + 1);
	  }

	  // Recurse into target specific vector shuffles to find scalars.
	  if (isTargetShuffle(Opcode)) {
	    MVT ShufVT = V.getSimpleValueType();
	    MVT ShufSVT = ShufVT.getVectorElementType();
	    const int NumElems = (int)ShufVT.getVectorNumElements();
	    SmallVector<int, 16> DecodedMask;
	    SmallVector<SDValue, 16> DecodedOps;
	    bool IsUnary;

	    if (!getTargetShuffleMask(N, ShufVT, true, DecodedOps, DecodedMask,
	                              IsUnary))
	      return SDValue();

	    const int SrcElt = DecodedMask[Index];
	    if (SrcElt == SM_SentinelZero) {
	      if (ShufSVT.isInteger())
	        return DAG.getConstant(0, SDLoc(N), ShufSVT);
	      return DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
	    }
	    if (SrcElt == SM_SentinelUndef)
	      return DAG.getUNDEF(ShufSVT);

	    assert(0 <= SrcElt && SrcElt < (2*NumElems) && "Shuffle index out of range");
	    SDValue Src = (SrcElt < NumElems) ? DecodedOps[0] : DecodedOps[1];
	    return getShuffleScalarElt(Src.getNode(), SrcElt % NumElems, DAG,
	                               Depth + 1);
	  }

	  // Look through a bitcast, but only when the source is a vector with the
	  // same number of elements so that Index still names the same lane.
	  if (Opcode == ISD::BITCAST) {
	    V = V.getOperand(0);
	    EVT SrcVT = V.getValueType();
	    const unsigned NumElems = VT.getVectorNumElements();

	    if (!SrcVT.isVector())
	      return SDValue();
	    if (SrcVT.getVectorNumElements() != NumElems)
	      return SDValue();
	  }

	  // Actual nodes that may contain scalar elements.
	  switch (V.getOpcode()) {
	  case ISD::SCALAR_TO_VECTOR:
	    // Only lane 0 is defined.
	    if (Index == 0)
	      return V.getOperand(0);
	    return DAG.getUNDEF(VT.getVectorElementType());
	  case ISD::BUILD_VECTOR:
	    return V.getOperand(Index);
	  default:
	    return SDValue();
	  }
	}

	// Use PINSRB/PINSRW/PINSRD to create a build vector.
	//
	// Op is the build_vector being lowered. Bit i of NonZeros is set when operand
	// i has to be inserted; operands whose bit is clear are skipped. A non-zero
	// NumZero makes the insertion chain start from a zero vector. NumNonZero is
	// not used by this function. Only v8i16 (SSE2) and v16i8/v4i32 (SSE4.1) are
	// accepted. Returns the resulting vector; it is a null SDValue if no bit
	// below the element count is set in NonZeros.
	static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
	unsigned NumNonZero, unsigned NumZero,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	unsigned NumElts = VT.getVectorNumElements();
	assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) \|\|
	((VT == MVT::v16i8 \|\| VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
	"Illegal vector insertion");

	SDLoc dl(Op);
	SDValue V;
	// True until the first element flagged in NonZeros has been handled.
	bool First = true;

	for (unsigned i = 0; i < NumElts; ++i) {
	bool IsNonZero = (NonZeros & (1 << i)) != 0;
	if (!IsNonZero)
	continue;

	// If the build vector contains zeros or our first insertion is not the
	// first index then insert into zero vector to break any register
	// dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
	if (First) {
	First = false;
	if (NumZero \|\| 0 != i)
	V = getZeroVector(VT, Subtarget, DAG, dl);
	else {
	assert(0 == i && "Expected insertion into zero-index");
	// Element 0 is materialised as an i32 scalar placed in a v4i32 and then
	// bitcast to VT; no INSERT_VECTOR_ELT is emitted for it.
	V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
	V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
	V = DAG.getBitcast(VT, V);
	continue;
	}
	}
	// Every other flagged element is inserted at its own index.
	V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
	DAG.getIntPtrConstant(i, dl));
	}

	return V;
	}

	/// Custom lower build_vector of v16i8.
	///
	/// Bit i of \p NonZeros flags operand i as non-zero. With SSE4.1 each byte is
	/// inserted directly; otherwise adjacent byte pairs are merged into 16-bit
	/// words that are inserted into a v8i16, which is only attempted when at most
	/// 8 bytes are non-zero.
	static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // SSE4.1 - use PINSRB to insert each byte directly.
	  if (Subtarget.hasSSE41())
	    return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	                                    Subtarget);

	  // Pre-SSE4.1 - too many non-zero bytes to be worth lowering this way.
	  if (NumNonZero > 8)
	    return SDValue();

	  SDLoc dl(Op);
	  SDValue Vec;
	  bool NeedInit = true;

	  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
	  for (unsigned Word = 0; Word < 8; ++Word) {
	    const unsigned LoIdx = 2 * Word;
	    const unsigned HiIdx = LoIdx + 1;
	    const bool LoIsNonZero = (NonZeros & (1 << LoIdx)) != 0;
	    const bool HiIsNonZero = (NonZeros & (1 << HiIdx)) != 0;

	    // Create the base vector the first time a pair contains a non-zero byte.
	    if (NeedInit && (NonZeros & (3 << LoIdx)) != 0) {
	      Vec = NumZero ? getZeroVector(MVT::v8i16, Subtarget, DAG, dl)
	                    : DAG.getUNDEF(MVT::v8i16);
	      NeedInit = false;
	    }

	    // FIXME: Investigate extending to i32 instead of just i16.
	    // FIXME: Investigate combining the first 4 bytes as a i32 instead.
	    SDValue LoElt, Merged;
	    if (LoIsNonZero)
	      LoElt =
	          DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(LoIdx));
	    if (HiIsNonZero) {
	      // The high byte of the pair lives in bits 8..15 of the word.
	      Merged =
	          DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(HiIdx));
	      Merged = DAG.getNode(ISD::SHL, dl, MVT::i16, Merged,
	                           DAG.getConstant(8, dl, MVT::i8));
	      if (LoIsNonZero)
	        Merged = DAG.getNode(ISD::OR, dl, MVT::i16, Merged, LoElt);
	    } else {
	      Merged = LoElt;
	    }

	    // Both bytes of the pair are zero - nothing to insert.
	    if (!Merged)
	      continue;

	    if (Word != 0) {
	      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, Vec, Merged,
	                        DAG.getIntPtrConstant(Word, dl));
	      continue;
	    }

	    // The first word replaces the base vector via SCALAR_TO_VECTOR and
	    // VZEXT_MOVL instead of an insert.
	    Vec = NumZero ? DAG.getZExtOrTrunc(Merged, dl, MVT::i32)
	                  : DAG.getAnyExtOrTrunc(Merged, dl, MVT::i32);
	    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Vec);
	    Vec = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, Vec);
	    Vec = DAG.getBitcast(MVT::v8i16, Vec);
	  }

	  return DAG.getBitcast(MVT::v16i8, Vec);
	}

	/// Custom lower build_vector of v8i16.
	///
	/// Gives up (returns a null SDValue) when more than 4 elements are non-zero
	/// and SSE4.1 is unavailable; otherwise defers to LowerBuildVectorAsInsert.
	static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (NumNonZero > 4) {
	    if (!Subtarget.hasSSE41())
	      return SDValue();
	  }

	  // Use PINSRW to insert each word directly.
	  SDValue Inserted = LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero,
	                                              NumZero, DAG, Subtarget);
	  return Inserted;
	}

	/// Custom lower build_vector of v4i32 or v4f32.
	///
	/// Handles build vectors whose elements are each either zeroable (undef or
	/// zero) or an EXTRACT_VECTOR_ELT with a constant index from a 128-bit vector.
	/// Such a node is lowered either as a shuffle of one source vector with a
	/// zero vector or, with SSE4.1, as an X86ISD::INSERTPS. Returns a null SDValue
	/// when neither form applies.
	static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// Find all zeroable elements.
	std::bitset<4> Zeroable;
	for (int i=0; i < 4; ++i) {
	SDValue Elt = Op->getOperand(i);
	Zeroable[i] = (Elt.isUndef() \|\| X86::isZeroNode(Elt));
	}
	assert(Zeroable.size() - Zeroable.count() > 1 &&
	"We expect at least two non-zero elements!");

	// We only know how to deal with build_vector nodes where elements are either
	// zeroable or extract_vector_elt with constant index.
	SDValue FirstNonZero;
	// Only meaningful once FirstNonZero has been set in the loop below.
	unsigned FirstNonZeroIdx;
	for (unsigned i=0; i < 4; ++i) {
	if (Zeroable[i])
	continue;
	SDValue Elt = Op->getOperand(i);
	if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Elt.getOperand(1)))
	return SDValue();
	// Make sure that this node is extracting from a 128-bit vector.
	MVT VT = Elt.getOperand(0).getSimpleValueType();
	if (!VT.is128BitVector())
	return SDValue();
	if (!FirstNonZero.getNode()) {
	FirstNonZero = Elt;
	FirstNonZeroIdx = i;
	}
	}

	assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
	// V1 is the vector the first non-zeroable element is extracted from; VT is
	// that vector's type.
	SDValue V1 = FirstNonZero.getOperand(0);
	MVT VT = V1.getSimpleValueType();

	// See if this build_vector can be lowered as a blend with zero.
	// The loop stops early at the first element that is not "lane EltIdx of V1";
	// Elt/EltMaskIdx/EltIdx then describe that element for the INSERTPS path.
	SDValue Elt;
	unsigned EltMaskIdx, EltIdx;
	int Mask[4];
	for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
	if (Zeroable[EltIdx]) {
	// The zero vector will be on the right hand side.
	Mask[EltIdx] = EltIdx+4;
	continue;
	}

	Elt = Op->getOperand(EltIdx);
	// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
	EltMaskIdx = Elt.getConstantOperandVal(1);
	if (Elt.getOperand(0) != V1 \|\| EltMaskIdx != EltIdx)
	break;
	Mask[EltIdx] = EltIdx;
	}

	// All four elements are either zeroable or the matching lane of V1.
	if (EltIdx == 4) {
	// Let the shuffle legalizer deal with blend operations.
	SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
	// NOTE(review): VT was taken from V1 above and V1 has not changed since, so
	// this condition looks like it can never be true - confirm.
	if (V1.getSimpleValueType() != VT)
	V1 = DAG.getBitcast(VT, V1);
	return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
	}

	// See if we can lower this build_vector to a INSERTPS.
	if (!Subtarget.hasSSE41())
	return SDValue();

	// V2 is the source of the element that broke the blend pattern. If that
	// element is the very first non-zeroable one, the destination vector is not
	// known yet and is picked from the remaining elements below.
	SDValue V2 = Elt.getOperand(0);
	if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
	V1 = SDValue();

	// Every remaining non-zeroable element must be the matching lane of V1.
	bool CanFold = true;
	for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
	if (Zeroable[i])
	continue;

	SDValue Current = Op->getOperand(i);
	SDValue SrcVector = Current->getOperand(0);
	if (!V1.getNode())
	V1 = SrcVector;
	CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
	}

	if (!CanFold)
	return SDValue();

	assert(V1.getNode() && "Expected at least two non-zero elements!");
	// INSERTPS is emitted on v4f32, so bitcast both inputs if necessary.
	if (V1.getSimpleValueType() != MVT::v4f32)
	V1 = DAG.getBitcast(MVT::v4f32, V1);
	if (V2.getSimpleValueType() != MVT::v4f32)
	V2 = DAG.getBitcast(MVT::v4f32, V2);

	// Ok, we can emit an INSERTPS instruction.
	unsigned ZMask = Zeroable.to_ulong();

	// Immediate layout: source lane in bits 7:6, destination lane in bits 5:4
	// and the zeroable-element mask in the low bits.
	unsigned InsertPSMask = EltMaskIdx << 6 \| EltIdx << 4 \| ZMask;
	assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	SDLoc DL(Op);
	SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
	DAG.getIntPtrConstant(InsertPSMask, DL));
	// The result is bitcast back to VT, the type of the first source vector.
	return DAG.getBitcast(VT, Result);
	}

	/// Return a vector logical shift node.
	///
	/// Shifts the whole 128-bit value \p SrcOp left (\p isLeft) or right by
	/// \p NumBits bits, which must be a multiple of 8. The shift is performed on
	/// a v16i8 bitcast of the source with X86ISD::VSHLDQ / X86ISD::VSRLDQ and the
	/// result is bitcast back to \p VT.
	static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
	                         SelectionDAG &DAG, const TargetLowering &TLI,
	                         const SDLoc &dl) {
	  assert(VT.is128BitVector() && "Unknown type for VShift");
	  assert(NumBits % 8 == 0 && "Only support byte sized shifts");

	  const MVT ByteVT = MVT::v16i8;
	  SDValue Bytes = DAG.getBitcast(ByteVT, SrcOp);

	  // The shift amount operand is expressed in bytes.
	  MVT AmtVT = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	  SDValue Amt = DAG.getConstant(NumBits/8, dl, AmtVT);

	  unsigned ShiftOpc = X86ISD::VSRLDQ;
	  if (isLeft)
	    ShiftOpc = X86ISD::VSHLDQ;

	  SDValue Shifted = DAG.getNode(ShiftOpc, dl, ByteVT, Bytes, Amt);
	  return DAG.getBitcast(VT, Shifted);
	}

	/// Try to turn a splat of the scalar load \p SrcOp into a full-width vector
	/// load followed by a splat shuffle.
	///
	/// Only normal, non-volatile i32/f32 loads whose address is a frame index,
	/// optionally plus a constant offset, are handled. The stack object must be
	/// (or be made) aligned to the size of \p VT in bytes, and the offset must
	/// select a 4-byte aligned element inside one such aligned chunk. Returns a
	/// null SDValue if the transformation does not apply.
	static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
	                                      SelectionDAG &DAG) {
	  // Check if the scalar load can be widened into a vector load. And if
	  // the address is "base + cst" see if the cst can be "absorbed" into
	  // the shuffle mask.
	  LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp);
	  if (!LD)
	    return SDValue();

	  SDValue Ptr = LD->getBasePtr();
	  if (!ISD::isNormalLoad(LD))
	    return SDValue();
	  if (LD->isVolatile())
	    return SDValue();
	  EVT PVT = LD->getValueType(0);
	  if (PVT != MVT::i32 && PVT != MVT::f32)
	    return SDValue();

	  // Split the address into a frame index and a constant byte offset.
	  int FI = -1;
	  int64_t Offset = 0;
	  if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
	    FI = FINode->getIndex();
	    Offset = 0;
	  } else if (DAG.isBaseWithConstantOffset(Ptr) &&
	             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
	    FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
	    Offset = Ptr.getConstantOperandVal(1);
	    Ptr = Ptr.getOperand(0);
	  } else {
	    return SDValue();
	  }

	  // FIXME: 256-bit vector instructions don't require a strict alignment,
	  // improve this code to support it better.
	  unsigned RequiredAlign = VT.getSizeInBits()/8;
	  SDValue Chain = LD->getChain();

	  // (Offset % 16 or 32) must be a multiple of 4. The address is then
	  // Ptr + (Offset & ~(RequiredAlign - 1)).
	  // Validate the offset before touching the frame info so that a stack
	  // object is never over-aligned for a transformation we then abandon.
	  if (Offset < 0)
	    return SDValue();
	  if ((Offset % RequiredAlign) & 3)
	    return SDValue();

	  // Make sure the stack object alignment is at least 16 or 32.
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
	    // Can't change the alignment. FIXME: It's possible to compute
	    // the exact stack offset and reference FI + adjust offset instead.
	    // If someone really cares about this. That's the way to implement it.
	    if (MFI.isFixedObjectIndex(FI))
	      return SDValue();
	    MFI.setObjectAlignment(FI, RequiredAlign);
	  }

	  // Start of the aligned chunk that contains the loaded element.
	  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
	  if (StartOffset) {
	    SDLoc DL(Ptr);
	    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
	                      DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
	  }

	  // Index of the loaded 4-byte element within the widened vector.
	  int EltNo = (Offset - StartOffset) >> 2;
	  unsigned NumElems = VT.getVectorNumElements();

	  EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
	  // NOTE(review): getWithOffset() is applied to the original load's pointer
	  // info - confirm the resulting offset describes Ptr + StartOffset.
	  SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
	                           LD->getPointerInfo().getWithOffset(StartOffset));

	  // Splat the selected element across all lanes.
	  SmallVector<int, 8> Mask(NumElems, EltNo);

	  return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
	}

	/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
	/// elements can be replaced by a single large load which has the same value as
	/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
	///
	/// Example: <load i32 a, load i32 a+4, zero, undef> -> zextload a
	static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
	const SDLoc &DL, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	bool isAfterLegalize) {
	unsigned NumElems = Elts.size();

	int LastLoadedElt = -1;
	SmallBitVector LoadMask(NumElems, false);
	SmallBitVector ZeroMask(NumElems, false);
	SmallBitVector UndefMask(NumElems, false);

	// For each element in the initializer, see if we've found a load, zero or an
	// undef.
	for (unsigned i = 0; i < NumElems; ++i) {
	SDValue Elt = peekThroughBitcasts(Elts[i]);
	if (!Elt.getNode())
	return SDValue();

	if (Elt.isUndef())
	UndefMask[i] = true;
	else if (X86::isZeroNode(Elt) \|\| ISD::isBuildVectorAllZeros(Elt.getNode()))
	ZeroMask[i] = true;
	else if (ISD::isNON_EXTLoad(Elt.getNode())) {
	LoadMask[i] = true;
	LastLoadedElt = i;
	// Each loaded element must be the correct fractional portion of the
	// requested vector load.
	if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
	return SDValue();
	} else
	return SDValue();
	}
	assert((ZeroMask \| UndefMask \| LoadMask).count() == NumElems &&
	"Incomplete element masks");

	// Handle Special Cases - all undef or undef/zero.
	if (UndefMask.count() == NumElems)
	return DAG.getUNDEF(VT);

	// FIXME: Should we return this as a BUILD_VECTOR instead?
	if ((ZeroMask \| UndefMask).count() == NumElems)
	return VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	int FirstLoadedElt = LoadMask.find_first();
	SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
	LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
	EVT LDBaseVT = EltBase.getValueType();

	// Consecutive loads can contain UNDEFS but not ZERO elements.
	// Consecutive loads with UNDEF and ZERO elements require an
	// additional shuffle stage to clear the ZERO elements.
	bool IsConsecutiveLoad = true;
	bool IsConsecutiveLoadWithZeros = true;
	for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
	if (LoadMask[i]) {
	SDValue Elt = peekThroughBitcasts(Elts[i]);
	LoadSDNode *LD = cast<LoadSDNode>(Elt);
	if (!DAG.areNonVolatileConsecutiveLoads(
	LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
	i - FirstLoadedElt)) {
	IsConsecutiveLoad = false;
	IsConsecutiveLoadWithZeros = false;
	break;
	}
	} else if (ZeroMask[i]) {
	IsConsecutiveLoad = false;
	}
	}

	SmallVector<LoadSDNode *, 8> Loads;
	for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
	if (LoadMask[i])
	Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));

	auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
	auto MMOFlags = LDBase->getMemOperand()->getFlags();
	assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
	"Cannot merge volatile loads.");
	SDValue NewLd =
	DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
	LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
	for (auto *LD : Loads)
	DAG.makeEquivalentMemoryOrdering(LD, NewLd);
	return NewLd;
	};

	// LOAD - all consecutive load/undefs (must start/end with a load).
	// If we have found an entire vector of loads and undefs, then return a large
	// load of the entire vector width starting at the base pointer.
	// If the vector contains zeros, then attempt to shuffle those elements.
	if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
	(IsConsecutiveLoad \|\| IsConsecutiveLoadWithZeros)) {
	assert(LDBase && "Did not find base load for merging consecutive loads");
	EVT EltVT = LDBase->getValueType(0);
	// Ensure that the input vector size for the merged loads matches the
	// cumulative size of the input elements.
	if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
	return SDValue();

	if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
	return SDValue();

	// Don't create 256-bit non-temporal aligned loads without AVX2 as these
	// will lower to regular temporal loads and use the cache.
	if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
	VT.is256BitVector() && !Subtarget.hasInt256())
	return SDValue();

	if (IsConsecutiveLoad)
	return CreateLoad(VT, LDBase);

	// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
	// vector and a zero vector to clear out the zero elements.
	if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
	SmallVector<int, 4> ClearMask(NumElems, -1);
	for (unsigned i = 0; i < NumElems; ++i) {
	if (ZeroMask[i])
	ClearMask[i] = i + NumElems;
	else if (LoadMask[i])
	ClearMask[i] = i;
	}
	SDValue V = CreateLoad(VT, LDBase);
	SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT);
	return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
	}
	}

	int LoadSize =
	(1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();

	// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
	if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
	(LoadSize == 32 \|\| LoadSize == 64) &&
	((VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector()))) {
	MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
	: MVT::getIntegerVT(LoadSize);
	MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
	if (TLI.isTypeLegal(VecVT)) {
	SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
	SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
	SDValue ResNode =
	DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
	LDBase->getPointerInfo(),
	LDBase->getAlignment(),
	MachineMemOperand::MOLoad);
	for (auto *LD : Loads)
	DAG.makeEquivalentMemoryOrdering(LD, ResNode);
	return DAG.getBitcast(VT, ResNode);
	}
	}

	return SDValue();
	}

	/// Slice \p SplatValue into VT-scalar-sized pieces and return them as an IR
	/// constant vector.
	///
	/// \param VT            Vector type whose scalar type (size and int/FP kind)
	///                      determines the element constants.
	/// \param SplatValue    Bit pattern to slice; element I is taken from bits
	///                      [I * ScalarSize, (I + 1) * ScalarSize).
	/// \param SplatBitSize  Number of bits of \p SplatValue to cover; the result
	///                      has SplatBitSize / ScalarSize elements.
	/// \param C             Context in which the constants are created.
	///
	/// Floating point elements are only supported for 32- and 64-bit scalars.
	static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
	                                   unsigned SplatBitSize, LLVMContext &C) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const unsigned EltCount = SplatBitSize / EltBits;
	  const bool IsFP = VT.isFloatingPoint();

	  SmallVector<Constant *, 32> Elements;
	  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
	    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Idx);

	    // Integer elements are materialized directly from the extracted bits.
	    if (!IsFP) {
	      Elements.push_back(
	          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
	      continue;
	    }

	    // FP elements reinterpret the bits using the matching IEEE semantics.
	    if (EltBits == 32) {
	      Elements.push_back(
	          ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Bits)));
	      continue;
	    }
	    assert(EltBits == 64 && "Unsupported floating point scalar size");
	    Elements.push_back(
	        ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Bits)));
	  }
	  return ConstantVector::get(ArrayRef<Constant *>(Elements));
	}

	/// Return true if any user of \p N is a target shuffle node, looking through
	/// BITCAST users.
	///
	/// Every user is examined: a BITCAST user that does not itself feed a target
	/// shuffle no longer ends the scan, so the answer does not depend on the
	/// order of the use list.
	static bool isUseOfShuffle(SDNode *N) {
	  for (auto *U : N->uses()) {
	    if (isTargetShuffle(U->getOpcode()))
	      return true;
	    // Ignore bitcasts: look at what the bitcast feeds, but keep scanning the
	    // remaining users of N when that path finds no shuffle.
	    if (U->getOpcode() == ISD::BITCAST && isUseOfShuffle(U))
	      return true;
	  }
	  return false;
	}

	/// Check whether the build vector \p Op is a splat of operand 0 in which every
	/// occurrence of the value is followed by the same number of zero/undef
	/// operands, i.e. a splat of a zero-extended value.
	///
	/// For example, the operand list (a,0,0,0,a,0,0,0,a,0,0,0,a,0,0,0) yields a.
	///
	/// \param Op       Build vector to inspect.
	/// \param NumElt   On success, set to the number of repetitions of the value.
	/// \param EltType  On success, set to the integer type as wide as one
	///                 repetition (value plus its zero/undef padding).
	/// \returns the repeated value, or an empty SDValue if the pattern does not
	///          match (in which case the out-parameters are left untouched).
	static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
	                                   unsigned &NumElt, MVT &EltType) {
	  const unsigned OperandCount = Op->getNumOperands();
	  SDValue Candidate = Op->getOperand(0);

	  auto IsPadding = [](SDValue V) { return V.isUndef() || isNullConstant(V); };

	  // Locate the first repetition of operand 0; everything before it must be
	  // padding. Without a repetition the stride is the whole operand list.
	  unsigned Stride = OperandCount;
	  for (unsigned Pos = 1; Pos < OperandCount; ++Pos) {
	    SDValue Cur = Op->getOperand(Pos);
	    if (Cur == Candidate) {
	      Stride = Pos;
	      break;
	    }
	    if (!IsPadding(Cur))
	      return SDValue();
	  }

	  // The stride must be a power of two larger than one.
	  if (Stride == 1 || !isPowerOf2_32(Stride))
	    return SDValue();

	  // Verify the rest of the vector follows the same value/padding layout.
	  for (unsigned Pos = Stride; Pos < OperandCount; ++Pos) {
	    SDValue Cur = Op->getOperand(Pos);
	    const bool AtSplatSlot = (Pos % Stride) == 0;
	    if (AtSplatSlot ? Cur != Candidate : !IsPadding(Cur))
	      return SDValue();
	  }

	  const unsigned NarrowBits = Op->getSimpleValueType(0).getScalarSizeInBits();
	  EltType = MVT::getIntegerVT(NarrowBits * Stride);
	  NumElt = OperandCount / Stride;
	  return Candidate;
	}

	/// Attempt to use the vbroadcast instruction to generate a splat value
	/// from a splat BUILD_VECTOR which uses:
	///  a. A single scalar load, or a constant.
	///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
	///
	/// \param BVOp       The BUILD_VECTOR node to lower.
	/// \param Subtarget  Queried for the AVX/AVX2/CDI/VLX features that gate the
	///                   individual broadcast forms.
	/// \param DAG        DAG in which the replacement nodes are created.
	///
	/// The VBROADCAST (or VBROADCASTM / SUBV_BROADCAST) node is returned when a
	/// pattern is found, or SDValue() otherwise.
	static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  // VBROADCAST requires AVX.
	  // TODO: Splats could be generated for non-AVX CPUs using SSE
	  // instructions, but there's less potential gain for only 128-bit vectors.
	  if (!Subtarget.hasAVX())
	    return SDValue();

	  MVT VT = BVOp->getSimpleValueType(0);
	  SDLoc dl(BVOp);

	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Unsupported vector type for broadcast.");

	  BitVector UndefElements;
	  SDValue Ld = BVOp->getSplatValue(&UndefElements);

	  // Attempt to use VBROADCASTM
	  // From this pattern:
	  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
	  // b. t1 = (build_vector t0 t0)
	  //
	  // Create (VBROADCASTM v2i1 X)
	  if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
	    // EltType/NumElts start out describing VT and are only overwritten when
	    // isSplatZeroExtended recognizes the zero-extended splat form.
	    MVT EltType = VT.getScalarType();
	    unsigned NumElts = VT.getVectorNumElements();
	    SDValue BOperand;
	    SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
	    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
	        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
	         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
	      if (ZeroExtended)
	        BOperand = ZeroExtended.getOperand(0);
	      else
	        BOperand = Ld.getOperand(0).getOperand(0);
	      if (BOperand.getValueType().isVector() &&
	          BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
	        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
	                                     NumElts == 8)) || // for broadcastmb2q
	            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
	                                     NumElts == 16))) { // for broadcastmw2d
	          SDValue Brdcst =
	              DAG.getNode(X86ISD::VBROADCASTM, dl,
	                          MVT::getVectorVT(EltType, NumElts), BOperand);
	          return DAG.getBitcast(VT, Brdcst);
	        }
	      }
	    }
	  }

	  // We need a splat of a single value to use broadcast, and it doesn't
	  // make any sense if the value is only in one element of the vector.
	  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
	    APInt SplatValue, Undef;
	    unsigned SplatBitSize;
	    bool HasUndef;
	    // Check if this is a repeated constant pattern suitable for broadcasting.
	    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
	        SplatBitSize > VT.getScalarSizeInBits() &&
	        SplatBitSize < VT.getSizeInBits()) {
	      // Avoid replacing with broadcast when it's a use of a shuffle
	      // instruction to preserve the present custom lowering of shuffles.
	      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
	        return SDValue();
	      // replace BUILD_VECTOR with broadcast of the repeated constants.
	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      LLVMContext *Ctx = DAG.getContext();
	      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
	      if (Subtarget.hasAVX()) {
	        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
	            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
	          // Splatted value can fit in one INTEGER constant in constant pool.
	          // Load the constant and broadcast it.
	          MVT CVT = MVT::getIntegerVT(SplatBitSize);
	          // getIntNTy takes the context by reference and hands back a
	          // pointer to the uniqued type, which getIntegerValue consumes.
	          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
	          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
	          SDValue CP = DAG.getConstantPool(C, PVT);
	          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	          Ld = DAG.getLoad(
	              CVT, dl, DAG.getEntryNode(), CP,
	              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	              Alignment);
	          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	                                       MVT::getVectorVT(CVT, Repeat), Ld);
	          return DAG.getBitcast(VT, Brdcst);
	        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
	          // Splatted value can fit in one FLOAT constant in constant pool.
	          // Load the constant and broadcast it.
	          // AVX have support for 32 and 64 bit broadcast for floats only.
	          // No 64bit integer in 32bit subtarget.
	          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
	          // Lower the splat via APFloat directly, to avoid any conversion.
	          Constant *C =
	              SplatBitSize == 32
	                  ? ConstantFP::get(*Ctx,
	                                    APFloat(APFloat::IEEEsingle(), SplatValue))
	                  : ConstantFP::get(*Ctx,
	                                    APFloat(APFloat::IEEEdouble(), SplatValue));
	          SDValue CP = DAG.getConstantPool(C, PVT);
	          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	          Ld = DAG.getLoad(
	              CVT, dl, DAG.getEntryNode(), CP,
	              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	              Alignment);
	          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	                                       MVT::getVectorVT(CVT, Repeat), Ld);
	          return DAG.getBitcast(VT, Brdcst);
	        } else if (SplatBitSize > 64) {
	          // Load the vector of constants and broadcast it.
	          MVT CVT = VT.getScalarType();
	          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
	                                             *Ctx);
	          SDValue VCP = DAG.getConstantPool(VecC, PVT);
	          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
	          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
	          Ld = DAG.getLoad(
	              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
	              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	              Alignment);
	          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
	          return DAG.getBitcast(VT, Brdcst);
	        }
	      }
	    }
	    return SDValue();
	  }

	  // From here on Ld is a non-null splat value used by at least two elements.
	  bool ConstSplatVal =
	      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

	  // Make sure that all of the users of a non-constant load are from the
	  // BUILD_VECTOR node.
	  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
	    return SDValue();

	  unsigned ScalarSize = Ld.getValueSizeInBits();
	  bool IsGE256 = (VT.getSizeInBits() >= 256);

	  // When optimizing for size, generate up to 5 extra bytes for a broadcast
	  // instruction to save 8 or more bytes of constant pool data.
	  // TODO: If multiple splats are generated to load the same constant,
	  // it may be detrimental to overall size. There needs to be a way to detect
	  // that condition to know if this is truly a size win.
	  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

	  // Handle broadcasting a single constant scalar from the constant pool
	  // into a vector.
	  // On Sandybridge (no AVX2), it is still better to load a constant vector
	  // from the constant pool and not to broadcast it from a scalar.
	  // But override that restriction when optimizing for size.
	  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
	  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
	    EVT CVT = Ld.getValueType();
	    assert(!CVT.isVector() && "Must not broadcast a vector type");

	    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
	    // For size optimization, also splat v2f64 and v2i64, and for size opt
	    // with AVX2, also splat i8 and i16.
	    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
	    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
	      const Constant *C = nullptr;
	      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
	        C = CI->getConstantIntValue();
	      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
	        C = CF->getConstantFPValue();

	      assert(C && "Invalid constant type");

	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      SDValue CP =
	          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
	      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	      Ld = DAG.getLoad(
	          CVT, dl, DAG.getEntryNode(), CP,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	          Alignment);

	      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	    }
	  }

	  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

	  // Handle AVX2 in-register broadcasts.
	  if (!IsLoad && Subtarget.hasInt256() &&
	      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The scalar source must be a normal load.
	  if (!IsLoad)
	    return SDValue();

	  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	      (Subtarget.hasVLX() && ScalarSize == 64))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
	  // double since there is no vbroadcastsd xmm
	  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
	    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
	      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	  }

	  // Unsupported broadcast.
	  return SDValue();
	}

	/// \brief Resolve the vector and lane actually read by an EXTRACT_VECTOR_ELT
	/// with a constant index, looking through a vector shuffle on the source.
	///
	/// \param ExtractedFromVec  In: the vector operand of the extract.
	///                          Out: replaced by the shuffle's first operand when
	///                          the extract can be redirected to it.
	/// \param ExtIdx            The (constant) index operand of the extract.
	/// \returns the lane index to use with the (possibly updated)
	///          \p ExtractedFromVec.
	static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
	                                         SDValue ExtIdx) {
	  const int RequestedIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

	  // Nothing to look through unless the source is a vector shuffle.
	  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
	  if (!Shuffle)
	    return RequestedIdx;

	  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
	  // lowered this:
	  //   (extract_vector_elt (v8f32 %1), Constant<6>)
	  // to:
	  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
	  //                           (extract_subvector (v8f32 %0), Constant<4>),
	  //                           undef)
	  //                       Constant<0>)
	  // In this case the vector is the extract_subvector expression and the index
	  // is 2, as specified by the shuffle.
	  SDValue Source = Shuffle->getOperand(0);
	  MVT SourceVT = Source.getSimpleValueType();
	  assert(SourceVT.getVectorElementType() ==
	         ExtractedFromVec.getSimpleValueType().getVectorElementType());

	  // Only redirect when the mask entry is undef or selects a lane of the
	  // shuffle's first operand.
	  const int MaskIdx = Shuffle->getMaskElt(RequestedIdx);
	  if (!isUndefOrInRange(MaskIdx, 0, SourceVT.getVectorNumElements()))
	    return RequestedIdx;

	  ExtractedFromVec = Source;
	  return MaskIdx;
	}

	/// Try to lower a BUILD_VECTOR whose operands are mostly constant-index
	/// extracts from at most two vectors of the same type as a vector shuffle of
	/// those vectors, followed by INSERT_VECTOR_ELT nodes for the few operands
	/// that are neither undef nor such an extract.
	///
	/// \returns the replacement value, or an empty SDValue when the pattern does
	///          not apply.
	static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  // The leftover operands are inserted one by one, so element insertion must
	  // be available for this type.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
	    return SDValue();

	  SDLoc DL(Op);
	  const unsigned OperandCount = Op.getNumOperands();

	  SDValue Src0;
	  SDValue Src1;
	  SmallVector<unsigned, 4> PendingInserts;
	  SmallVector<int, 8> ShuffleMask(OperandCount, -1);

	  for (unsigned Pos = 0; Pos != OperandCount; ++Pos) {
	    SDValue Elt = Op.getOperand(Pos);
	    const unsigned EltOpc = Elt.getOpcode();

	    // Undef operands keep their -1 mask entry.
	    if (EltOpc == ISD::UNDEF)
	      continue;

	    // Anything that is not an extract has to be inserted afterwards; give up
	    // once two such operands have already been recorded.
	    if (EltOpc != ISD::EXTRACT_VECTOR_ELT) {
	      if (PendingInserts.size() > 1)
	        return SDValue();
	      PendingInserts.push_back(Pos);
	      continue;
	    }

	    SDValue SrcVec = Elt.getOperand(0);
	    SDValue SrcIdx = Elt.getOperand(1);

	    // Only constant extract indices can be turned into a shuffle mask.
	    if (!isa<ConstantSDNode>(SrcIdx))
	      return SDValue();
	    const int Lane = getUnderlyingExtractedFromVec(SrcVec, SrcIdx);

	    // The source must have the same type as the vector being built.
	    if (SrcVec.getValueType() != VT)
	      return SDValue();

	    // Bind the source to the first or second shuffle input; a third distinct
	    // source cannot be expressed.
	    if (!Src0.getNode() || Src0 == SrcVec) {
	      Src0 = SrcVec;
	      ShuffleMask[Pos] = Lane;
	    } else if (!Src1.getNode() || Src1 == SrcVec) {
	      Src1 = SrcVec;
	      ShuffleMask[Pos] = Lane + OperandCount;
	    } else {
	      return SDValue();
	    }
	  }

	  // Without any extract there is nothing to shuffle.
	  if (!Src0.getNode())
	    return SDValue();

	  if (!Src1.getNode())
	    Src1 = DAG.getUNDEF(VT);
	  SDValue Result = DAG.getVectorShuffle(VT, DL, Src0, Src1, ShuffleMask);

	  for (unsigned InsertPos : PendingInserts)
	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
	                         Op.getOperand(InsertPos),
	                         DAG.getIntPtrConstant(InsertPos, DL));

	  return Result;
	}

	/// Pack a constant build vector of i1 elements into a scalar integer
	/// constant: bit I of the result is the low bit of operand I, and undef
	/// operands contribute a zero bit. The integer type is as wide as the vector,
	/// but never narrower than 8 bits.
	static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
	  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
	         Op.getScalarValueSizeInBits() == 1 &&
	         "Can not convert non-constant vector");

	  uint64_t Packed = 0;
	  const unsigned OperandCount = Op.getNumOperands();
	  for (unsigned Bit = 0; Bit != OperandCount; ++Bit) {
	    SDValue Elt = Op.getOperand(Bit);
	    if (Elt.isUndef())
	      continue;
	    const uint64_t Lsb = cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1;
	    Packed |= Lsb << Bit;
	  }

	  SDLoc dl(Op);
	  const int ResultBits = std::max((int)Op.getValueSizeInBits(), 8);
	  return DAG.getConstant(Packed, dl, MVT::getIntegerVT(ResultBits));
	}
	/// Lower a BUILD_VECTOR whose element type is i1.
	///
	/// All-zeros and all-ones vectors are returned unchanged. Fully constant
	/// vectors become a bitcast of the packed integer immediate (v64i1 is split
	/// into two v32i1 halves on non-64-bit subtargets). A splat of one value
	/// becomes a select between all-ones and all-zeros. Otherwise the constant
	/// elements are materialized as an immediate and the remaining elements are
	/// inserted one at a time.
	static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {

	  MVT VT = Op.getSimpleValueType();
	  assert((VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected type in LowerBUILD_VECTORvXi1!");

	  SDLoc dl(Op);

	  // Turn an integer (or undef) value into a VT mask: bitcast when the sizes
	  // agree, otherwise go through v8i1 and take the low subvector.
	  auto MaterializeMask = [&](SDValue Bits) -> SDValue {
	    if (Bits.getValueSizeInBits() == VT.getSizeInBits())
	      return DAG.getBitcast(VT, Bits);
	    SDValue Wide = DAG.getBitcast(MVT::v8i1, Bits);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Wide,
	                       DAG.getIntPtrConstant(0, dl));
	  };

	  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
	      ISD::isBuildVectorAllOnes(Op.getNode()))
	    return Op;

	  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	      // Split the pieces.
	      SDValue LoHalf =
	          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
	      SDValue HiHalf =
	          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
	      // We have to manually lower both halves so getNode doesn't try to
	      // reassemble the build_vector.
	      LoHalf = LowerBUILD_VECTORvXi1(LoHalf, DAG, Subtarget);
	      HiHalf = LowerBUILD_VECTORvXi1(HiHalf, DAG, Subtarget);
	      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
	    }
	    return MaterializeMask(ConvertI1VectorToInteger(Op, DAG));
	  }

	  // Vector has one or more non-const elements
	  uint64_t ConstBits = 0;
	  SmallVector<unsigned, 16> VariableLanes;
	  bool AllSame = true;
	  bool SawConstant = false;
	  int FirstDefined = -1;
	  const unsigned OperandCount = Op.getNumOperands();
	  for (unsigned Pos = 0; Pos < OperandCount; ++Pos) {
	    SDValue Elt = Op.getOperand(Pos);
	    if (Elt.isUndef())
	      continue;

	    if (auto *CN = dyn_cast<ConstantSDNode>(Elt)) {
	      ConstBits |= (CN->getZExtValue() & 0x1) << Pos;
	      SawConstant = true;
	    } else {
	      VariableLanes.push_back(Pos);
	    }

	    // Track whether every defined element equals the first defined one.
	    if (FirstDefined < 0)
	      FirstDefined = Pos;
	    else if (Elt != Op.getOperand(FirstDefined))
	      AllSame = false;
	  }

	  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
	  if (AllSame)
	    return DAG.getSelect(dl, VT, Op.getOperand(FirstDefined),
	                         DAG.getConstant(1, dl, VT),
	                         DAG.getConstant(0, dl, VT));

	  // insert elements one by one
	  SDValue Base;
	  if (ConstBits) {
	    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
	    Base = DAG.getConstant(ConstBits, dl, ImmVT);
	  } else if (SawConstant) {
	    Base = DAG.getConstant(0, dl, VT);
	  } else {
	    Base = DAG.getUNDEF(VT);
	  }
	  SDValue Result = MaterializeMask(Base);

	  for (unsigned Lane : VariableLanes)
	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	                         Op.getOperand(Lane),
	                         DAG.getIntPtrConstant(Lane, dl));
	  return Result;
	}

	/// \brief Return true if \p N implements a horizontal binop and return the
	/// operands for the horizontal binop into V0 and V1.
	///
	/// This is a helper function of LowerToHorizontalOp().
	/// This function checks that the build_vector \p N in input implements a
	/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
	/// operation to match.
	/// For example, if \p Opcode is equal to ISD::ADD, then this function
	/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
	/// is equal to ISD::SUB, then this function checks if this is a horizontal
	/// arithmetic sub.
	///
	/// This function only analyzes elements of \p N whose indices are
	/// in range [BaseIdx, LastIdx).
	///
	/// Elements in the first half of that range must combine adjacent lanes of
	/// one vector (reported in \p V0); elements in the second half must combine
	/// adjacent lanes of one vector (reported in \p V1). Both outputs are reset
	/// to UNDEF on entry, so an output stays UNDEF when its half of the range
	/// holds only undef elements. Undef elements are skipped, which means a
	/// range consisting only of undef elements also yields true.
	static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
	SelectionDAG &DAG,
	unsigned BaseIdx, unsigned LastIdx,
	SDValue &V0, SDValue &V1) {
	EVT VT = N->getValueType(0);

	assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
	assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
	"Invalid Vector in input!");

	// Only ADD/FADD may have their two extracts in swapped order.
	bool IsCommutable = (Opcode == ISD::ADD \|\| Opcode == ISD::FADD);
	bool CanFold = true;
	// Lane index the first extract of the next element is expected to read.
	unsigned ExpectedVExtractIdx = BaseIdx;
	unsigned NumElts = LastIdx - BaseIdx;
	V0 = DAG.getUNDEF(VT);
	V1 = DAG.getUNDEF(VT);

	// Check if N implements a horizontal binop.
	for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
	SDValue Op = N->getOperand(i + BaseIdx);

	// Skip UNDEFs.
	if (Op->isUndef()) {
	// Update the expected vector extract index.
	// The expected index restarts at BaseIdx when the second half begins.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	ExpectedVExtractIdx += 2;
	continue;
	}

	// The element must be the requested binop and have no other user.
	CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

	if (!CanFold)
	break;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
	CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op0.getOperand(0) == Op1.getOperand(0) &&
	isa<ConstantSDNode>(Op0.getOperand(1)) &&
	isa<ConstantSDNode>(Op1.getOperand(1)));
	if (!CanFold)
	break;

	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

	// The first non-undef element of each half fixes that half's source vector,
	// which must have the same type as the build_vector.
	if (i * 2 < NumElts) {
	if (V0.isUndef()) {
	V0 = Op0.getOperand(0);
	if (V0.getValueType() != VT)
	return false;
	}
	} else {
	if (V1.isUndef()) {
	V1 = Op0.getOperand(0);
	if (V1.getValueType() != VT)
	return false;
	}
	// The expected index restarts at BaseIdx when the second half begins.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	}

	// Both extracts must read the expected adjacent lanes of this half's source.
	SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
	if (I0 == ExpectedVExtractIdx)
	CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
	else if (IsCommutable && I1 == ExpectedVExtractIdx) {
	// Try to match the following dag sequence:
	// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
	CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
	} else
	CanFold = false;

	ExpectedVExtractIdx += 2;
	}

	return CanFold;
	}

	/// \brief Build a 256-bit horizontal add/sub out of two 128-bit horizontal
	/// operations whose results are concatenated.
	///
	/// This is a helper function of LowerToHorizontalOp().
	/// \p V0 and \p V1 must be 256-bit vectors of the same type. Each is split
	/// into its lower and upper 128-bit halves, and the halves feed two nodes of
	/// kind \p X86Opcode.
	///
	/// \p Mode selects how the halves are paired:
	///  - Mode set:
	///      LO = X86Opcode V0_LO, V0_HI
	///      HI = X86Opcode V1_LO, V1_HI
	///  - Mode clear:
	///      LO = X86Opcode V0_LO, V1_LO
	///      HI = X86Opcode V0_HI, V1_HI
	///
	/// When \p isUndefLO (resp. \p isUndefHI) is set, the lower (resp. upper)
	/// 128 bits of the result are left UNDEF. A half is also left UNDEF when its
	/// inputs are undef, so no horizontal node is emitted for it.
	static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
	                                     const SDLoc &DL, SelectionDAG &DAG,
	                                     unsigned X86Opcode, bool Mode,
	                                     bool isUndefLO, bool isUndefHI) {
	  MVT VT = V0.getSimpleValueType();
	  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
	         "Invalid nodes in input!");

	  const unsigned HalfIdx = VT.getVectorNumElements() / 2;
	  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
	  SDValue V0_HI = extract128BitVector(V0, HalfIdx, DAG, DL);
	  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
	  SDValue V1_HI = extract128BitVector(V1, HalfIdx, DAG, DL);
	  MVT HalfVT = V0_LO.getSimpleValueType();

	  SDValue LO = DAG.getUNDEF(HalfVT);
	  SDValue HI = DAG.getUNDEF(HalfVT);

	  // Don't emit a horizontal binop if the result is expected to be UNDEF.
	  const bool LoHasInput =
	      Mode ? !V0->isUndef() : !(V0_LO->isUndef() && V1_LO->isUndef());
	  if (!isUndefLO && LoHasInput)
	    LO = DAG.getNode(X86Opcode, DL, HalfVT, V0_LO, Mode ? V0_HI : V1_LO);

	  const bool HiHasInput =
	      Mode ? !V1->isUndef() : !(V0_HI->isUndef() && V1_HI->isUndef());
	  if (!isUndefHI && HiHasInput)
	    HI = DAG.getNode(X86Opcode, DL, HalfVT, Mode ? V1_LO : V0_HI, V1_HI);

	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
	}

	/// Returns true iff \p BV builds a vector with the result equivalent to
	/// the result of ADDSUB operation.
	/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
	/// are written to the parameters \p Opnd0 and \p Opnd1.
	///
	/// Only v4f32/v2f64 (with SSE3), v8f32/v4f64 (with AVX) and v16f32/v8f64
	/// (with AVX512) build vectors are considered. \p NumExtracts is reset to 0
	/// and then counts the non-undef elements that were matched; \p Opnd0 and
	/// \p Opnd1 are only written when true is returned.
	static bool isAddSub(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1,
	unsigned &NumExtracts) {

	MVT VT = BV->getSimpleValueType(0);
	// Reject any type/feature combination not listed in the header comment.
	if ((!Subtarget.hasSSE3() \|\| (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
	(!Subtarget.hasAVX() \|\| (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
	(!Subtarget.hasAVX512() \|\| (VT != MVT::v16f32 && VT != MVT::v8f64)))
	return false;

	unsigned NumElts = VT.getVectorNumElements();
	// The two source vectors; they stay UNDEF until the first match fixes them.
	SDValue InVec0 = DAG.getUNDEF(VT);
	SDValue InVec1 = DAG.getUNDEF(VT);

	NumExtracts = 0;

	// Odd-numbered elements in the input build vector are obtained from
	// adding two integer/float elements.
	// Even-numbered elements in the input build vector are obtained from
	// subtracting two integer/float elements.
	// The expected opcode therefore starts at FSUB for element 0 and alternates
	// with FADD on every element, including undef ones.
	unsigned ExpectedOpcode = ISD::FSUB;
	unsigned NextExpectedOpcode = ISD::FADD;
	bool AddFound = false;
	bool SubFound = false;

	for (unsigned i = 0, e = NumElts; i != e; ++i) {
	SDValue Op = BV->getOperand(i);

	// Skip 'undef' values.
	unsigned Opcode = Op.getOpcode();
	if (Opcode == ISD::UNDEF) {
	std::swap(ExpectedOpcode, NextExpectedOpcode);
	continue;
	}

	// Early exit if we found an unexpected opcode.
	if (Opcode != ExpectedOpcode)
	return false;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
	// Early exit if we cannot match that sequence.
	if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Op0.getOperand(1)) \|\|
	!isa<ConstantSDNode>(Op1.getOperand(1)) \|\|
	Op0.getOperand(1) != Op1.getOperand(1))
	return false;

	// Both extracts must read lane i, the position of this element.
	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	if (I0 != i)
	return false;

	// We found a valid add/sub node. Update the information accordingly.
	if (i & 1)
	AddFound = true;
	else
	SubFound = true;

	// Update InVec0 and InVec1.
	// Each source vector must have the same type as the build vector.
	if (InVec0.isUndef()) {
	InVec0 = Op0.getOperand(0);
	if (InVec0.getSimpleValueType() != VT)
	return false;
	}
	if (InVec1.isUndef()) {
	InVec1 = Op1.getOperand(0);
	if (InVec1.getSimpleValueType() != VT)
	return false;
	}

	// Make sure that operands in input to each add/sub node always
	// come from a same pair of vectors.
	if (InVec0 != Op0.getOperand(0)) {
	// FSUB is not commutable, so a mismatched first operand is fatal.
	if (ExpectedOpcode == ISD::FSUB)
	return false;

	// FADD is commutable. Try to commute the operands
	// and then test again.
	std::swap(Op0, Op1);
	if (InVec0 != Op0.getOperand(0))
	return false;
	}

	if (InVec1 != Op1.getOperand(0))
	return false;

	// Update the pair of expected opcodes.
	std::swap(ExpectedOpcode, NextExpectedOpcode);

	// Increment the number of extractions done.
	++NumExtracts;
	}

	// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
	// At least one FADD element and one FSUB element must have been matched.
	if (!AddFound \|\| !SubFound \|\| InVec0.isUndef() \|\| InVec1.isUndef())
	return false;

	Opnd0 = InVec0;
	Opnd1 = InVec1;
	return true;
	}

	/// Returns true if is possible to fold MUL and an idiom that has already been
	/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
	/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
	/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
	///
	/// Prior to calling this function it should be known that there is some
	/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
	/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
	/// before replacement of such SDNode with ADDSUB operation. Thus the number
	/// of \p Opnd0 uses is expected to be equal to 2.
	/// For example, this function may be called for the following IR:
	/// %AB = fmul fast <2 x double> %A, %B
	/// %Sub = fsub fast <2 x double> %AB, %C
	/// %Add = fadd fast <2 x double> %AB, %C
	/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
	/// <2 x i32> <i32 0, i32 3>
	/// There is a def for %Addsub here, which potentially can be replaced by
	/// X86ISD::ADDSUB operation:
	/// %Addsub = X86ISD::ADDSUB %AB, %C
	/// and such ADDSUB can further be replaced with FMADDSUB:
	/// %Addsub = FMADDSUB %A, %B, %C.
	///
	/// The main reason why this method is called before the replacement of the
	/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
	/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
	/// FMADDSUB is.
	static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
	SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
	unsigned ExpectedUses) {
	if (Opnd0.getOpcode() != ISD::FMUL \|\|
	!Opnd0->hasNUsesOfValue(ExpectedUses, 0) \|\| !Subtarget.hasAnyFMA())
	return false;

	// FIXME: These checks must match the similar ones in
	// DAGCombiner::visitFADDForFMACombine. It would be good to have one
	// function that would answer if it is Ok to fuse MUL + ADD to FMADD
	// or MUL + ADDSUB to FMADDSUB.
	const TargetOptions &Options = DAG.getTarget().Options;
	bool AllowFusion =
	(Options.AllowFPOpFusion == FPOpFusion::Fast \|\| Options.UnsafeFPMath);
	if (!AllowFusion)
	return false;

	Opnd2 = Opnd1;
	Opnd1 = Opnd0.getOperand(1);
	Opnd0 = Opnd0.getOperand(0);

	return true;
	}

	/// Try to replace a build_vector that computes an 'addsub' or 'fmaddsub'
	/// pattern with a single X86ISD::ADDSUB or X86ISD::FMADDSUB node.
	///
	/// \returns the new node, or an empty SDValue if \p BV does not match or the
	///          resulting node would not be available for its type.
	static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  SDValue LHS, RHS;
	  unsigned ExtractCount;
	  if (!isAddSub(BV, Subtarget, DAG, LHS, RHS, ExtractCount))
	    return SDValue();

	  MVT VT = BV->getSimpleValueType(0);
	  SDLoc DL(BV);

	  // Try to generate X86ISD::FMADDSUB node here.
	  // TODO: According to coverage reports, the FMADDSUB transform is not
	  // triggered by any tests.
	  SDValue Addend;
	  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, LHS, RHS, Addend, ExtractCount))
	    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, LHS, RHS, Addend);

	  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
	  // the ADDSUB idiom has been successfully recognized. There are no known
	  // X86 targets with 512-bit ADDSUB instructions!
	  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
	  // recognition.
	  return VT.is512BitVector() ? SDValue()
	                             : DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
	}

	/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
	static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = BV->getSimpleValueType(0);
	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumUndefsLO = 0;
	unsigned NumUndefsHI = 0;
	unsigned Half = NumElts/2;

	// Count the number of UNDEF operands in the build_vector in input.
	for (unsigned i = 0, e = Half; i != e; ++i)
	if (BV->getOperand(i)->isUndef())
	NumUndefsLO++;

	for (unsigned i = Half, e = NumElts; i != e; ++i)
	if (BV->getOperand(i)->isUndef())
	NumUndefsHI++;

	// Early exit if this is either a build_vector of all UNDEFs or all the
	// operands but one are UNDEF.
	if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
	return SDValue();

	SDLoc DL(BV);
	SDValue InVec0, InVec1;
	if ((VT == MVT::v4f32 \|\| VT == MVT::v2f64) && Subtarget.hasSSE3()) {
	// Try to match an SSE3 float HADD/HSUB.
	if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
	} else if ((VT == MVT::v4i32 \|\| VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
	// Try to match an SSSE3 integer HADD/HSUB.
	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
	}

	if (!Subtarget.hasAVX())
	return SDValue();

	if ((VT == MVT::v8f32 \|\| VT == MVT::v4f64)) {
	// Try to match an AVX horizontal add/sub of packed single/double
	// precision floating point values from 256-bit vectors.
	SDValue InVec2, InVec3;
	if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
	} else if (VT == MVT::v8i32 \|\| VT == MVT::v16i16) {
	// Try to match an AVX2 horizontal add/sub of signed integers.
	SDValue InVec2, InVec3;
	unsigned X86Opcode;
	bool CanFold = true;

	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	X86Opcode = X86ISD::HADD;
	else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	X86Opcode = X86ISD::HSUB;
	else
	CanFold = false;

	if (CanFold) {
	// Fold this build_vector into a single horizontal add/sub.
	// Do this only if the target has AVX2.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

	// Do not try to expand this build_vector into a pair of horizontal
	// add/sub if we can emit a pair of scalar add/sub.
	if (NumUndefsLO + 1 == Half \|\| NumUndefsHI + 1 == Half)
	return SDValue();

	// Convert this build_vector into a pair of horizontal binop followed by
	// a concat vector.
	bool isUndefLO = NumUndefsLO == Half;
	bool isUndefHI = NumUndefsHI == Half;
	return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
	isUndefLO, isUndefHI);
	}
	}

	if ((VT == MVT::v8f32 \|\| VT == MVT::v4f64 \|\| VT == MVT::v8i32 \|\|
	VT == MVT::v16i16) && Subtarget.hasAVX()) {
	unsigned X86Opcode;
	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::HADD;
	else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::HSUB;
	else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::FHADD;
	else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::FHSUB;
	else
	return SDValue();

	// Don't try to expand this build_vector into a pair of horizontal add/sub
	// if we can simply emit a pair of scalar add/sub.
	if (NumUndefsLO + 1 == Half \|\| NumUndefsHI + 1 == Half)
	return SDValue();

	// Convert this build_vector into two horizontal add/sub followed by
	// a concat vector.
	bool isUndefLO = NumUndefsLO == Half;
	bool isUndefHI = NumUndefsHI == Half;
	return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
	isUndefLO, isUndefHI);
	}

	return SDValue();
	}

	/// When every element of a BUILD_VECTOR is produced by the same bitwise
	/// operation (AND, XOR or OR) with a constant right-hand operand, rebuild it
	/// as that operation applied to two BUILD_VECTORs: one of the left-hand
	/// operands and one of the constants. Returns an empty SDValue otherwise.
	/// NOTE: It's not in our interest to turn this into a general purpose
	/// vectorizer, but enough scalar bit operations are created by the later
	/// legalization + scalarization stages to need basic support.
	static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op->getSimpleValueType(0);
	  const unsigned EltCount = VT.getVectorNumElements();

	  // All elements must share the opcode of element 0.
	  // TODO: Should we allow UNDEFS and if so how many?
	  const unsigned Opcode = Op->getOperand(0).getOpcode();
	  for (unsigned Idx = 1; Idx < EltCount; ++Idx) {
	    if (Op->getOperand(Idx).getOpcode() != Opcode)
	      return SDValue();
	  }

	  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
	  const bool IsBitwiseOp =
	      Opcode == ISD::AND || Opcode == ISD::XOR || Opcode == ISD::OR;
	  if (!IsBitwiseOp)
	    return SDValue();

	  // Don't do this if the buildvector is a splat - we'd replace one
	  // constant with an entire vector.
	  if (Op->getSplatValue())
	    return SDValue();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrPromote(Opcode, VT))
	    return SDValue();

	  // Split each scalar op into its two operands. The constant is expected in
	  // the (canonicalized) right-hand position; bail out if it is not there.
	  SmallVector<SDValue, 4> LeftOperands, RightOperands;
	  for (SDValue Scalar : Op->ops()) {
	    SDValue Const = Scalar.getOperand(1);
	    if (!isa<ConstantSDNode>(Const))
	      return SDValue();
	    LeftOperands.push_back(Scalar.getOperand(0));
	    RightOperands.push_back(Const);
	  }

	  SDLoc DL(Op);
	  SDValue LeftVec = DAG.getBuildVector(VT, DL, LeftOperands);
	  SDValue RightVec = DAG.getBuildVector(VT, DL, RightOperands);
	  return DAG.getNode(Opcode, DL, VT, LeftVec, RightVec);
	}

	/// Create a vector constant without a load. SSE/AVX provide the bare minimum
	/// functionality to do this, so it's all zeros, all ones, or some derivation
	/// that is cheap to calculate.
	///
	/// Returns \p Op itself when it is already in the canonical form, a
	/// replacement zero/ones vector otherwise, or an empty SDValue when \p Op is
	/// neither an all-zeros nor (with SSE2) an all-ones build_vector.
	static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();

	  // Vectors containing all zeros can be matched by pxor and xorps.
	  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
	    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
	    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
	    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
	      return Op;

	    return getZeroVector(VT, Subtarget, DAG, DL);
	  }

	  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
	  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
	  // vpcmpeqd on 256-bit vectors.
	  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
	    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
	        (VT == MVT::v8i32 && Subtarget.hasInt256()))
	      return Op;

	    return getOnesVector(VT, DAG, DL);
	  }

	  return SDValue();
	}

	// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
	// reasoned to be a permutation of a vector by indices in a non-constant vector.
	// (build_vector (extract_elt V, (extract_elt I, 0)),
	//               (extract_elt V, (extract_elt I, 1)),
	//               ...
	// ->
	// (vpermv I, V)
	//
	// Returns the X86ISD::PSHUFB (for v16i8) or X86ISD::VPERMV node, or an empty
	// SDValue if the type is unsupported on this subtarget or the operands do not
	// match the pattern.
	//
	// TODO: Handle undefs
	// TODO: Utilize pshufb and zero mask blending to support more efficient
	// construction of vectors with constant-0 elements.
	// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
	// when no native operation available.
	static SDValue
	LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  // Look for VPERMV and PSHUFB opportunities.
	  MVT VT = V.getSimpleValueType();
	  switch (VT.SimpleTy) {
	  default:
	    return SDValue();
	  case MVT::v16i8:
	    // This type is lowered to X86ISD::PSHUFB below. PSHUFB is an SSSE3
	    // instruction, so SSE3 alone is not sufficient.
	    if (!Subtarget.hasSSSE3())
	      return SDValue();
	    break;
	  case MVT::v8f32:
	  case MVT::v8i32:
	    if (!Subtarget.hasAVX2())
	      return SDValue();
	    break;
	  case MVT::v4i64:
	  case MVT::v4f64:
	    if (!Subtarget.hasVLX())
	      return SDValue();
	    break;
	  case MVT::v16f32:
	  case MVT::v8f64:
	  case MVT::v16i32:
	  case MVT::v8i64:
	    if (!Subtarget.hasAVX512())
	      return SDValue();
	    break;
	  case MVT::v32i16:
	    if (!Subtarget.hasBWI())
	      return SDValue();
	    break;
	  case MVT::v8i16:
	  case MVT::v16i16:
	    if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
	      return SDValue();
	    break;
	  case MVT::v64i8:
	    if (!Subtarget.hasVBMI())
	      return SDValue();
	    break;
	  case MVT::v32i8:
	    if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
	      return SDValue();
	    break;
	  }
	  SDValue SrcVec, IndicesVec;
	  // Check for a match of the permute source vector and permute index elements.
	  // This is done by checking that the i-th build_vector operand is of the form:
	  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
	  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
	    SDValue Op = V.getOperand(Idx);
	    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // If this is the first extract encountered in V, set the source vector,
	    // otherwise verify the extract is from the previously defined source
	    // vector.
	    if (!SrcVec)
	      SrcVec = Op.getOperand(0);
	    else if (SrcVec != Op.getOperand(0))
	      return SDValue();
	    SDValue ExtractedIndex = Op->getOperand(1);
	    // Peek through extends.
	    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
	        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
	      ExtractedIndex = ExtractedIndex.getOperand(0);
	    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // If this is the first extract from the index vector candidate, set the
	    // indices vector, otherwise verify the extract is from the previously
	    // defined indices vector.
	    if (!IndicesVec)
	      IndicesVec = ExtractedIndex.getOperand(0);
	    else if (IndicesVec != ExtractedIndex.getOperand(0))
	      return SDValue();

	    // The index must be read from lane Idx of the indices vector.
	    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
	    if (!PermIdx || PermIdx->getZExtValue() != Idx)
	      return SDValue();
	  }
	  // The indices are always handled as an integer vector; for floating point
	  // results use the integer vector type with the same element width and count.
	  MVT IndicesVT = VT;
	  if (VT.isFloatingPoint())
	    IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
	                                 VT.getVectorNumElements());
	  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
	  // Widen a narrower source by inserting it at index 0 of an undef vector.
	  if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
	    SrcVec =
	        DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
	                    SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
	  }
	  // Note the operand order differs: PSHUFB takes (source, indices) while
	  // VPERMV takes (indices, source).
	  if (VT == MVT::v16i8)
	    return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
	  return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
	}

	/// Custom lowering of a BUILD_VECTOR node.
	///
	/// A sequence of progressively more generic strategies is tried in order
	/// (cheap constants, addsub / horizontal ops, broadcast, bit-op splitting,
	/// single-element insertion, splat, variable permute, consecutive loads,
	/// splitting wide vectors, per-type helpers, shuffles / inserts / unpacks).
	/// An empty SDValue is returned when none of them is chosen.
	SDValue
	X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);

	  MVT VT = Op.getSimpleValueType();
	  MVT ExtVT = VT.getVectorElementType();
	  unsigned NumElems = Op.getNumOperands();

	  // Generate vectors for predicate vectors.
	  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
	    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

	  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
	    return VectorConstant;

	  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
	  // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
	  // transform here.
	  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
	    return AddSub;
	  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
	    return HorizontalOp;
	  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
	    return Broadcast;
	  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
	    return BitOp;

	  unsigned EVTBits = ExtVT.getSizeInBits();

	  // Classify the operands. Undef elements are skipped entirely: they count
	  // neither as zero nor as non-zero and leave NumConstants untouched.
	  // NonZeros is a bitmask with bit i set when operand i is non-zero.
	  unsigned NumZero = 0;
	  unsigned NumNonZero = 0;
	  uint64_t NonZeros = 0;
	  bool IsAllConstants = true;
	  SmallSet<SDValue, 8> Values;
	  unsigned NumConstants = NumElems;
	  for (unsigned i = 0; i < NumElems; ++i) {
	    SDValue Elt = Op.getOperand(i);
	    if (Elt.isUndef())
	      continue;
	    Values.insert(Elt);
	    if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
	      IsAllConstants = false;
	      NumConstants--;
	    }
	    if (X86::isZeroNode(Elt))
	      NumZero++;
	    else {
	      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
	      NonZeros |= ((uint64_t)1 << i);
	      NumNonZero++;
	    }
	  }

	  // All undef vector. Return an UNDEF. All zero vectors were handled above.
	  if (NumNonZero == 0)
	    return DAG.getUNDEF(VT);

	  // If we are inserting one variable into a vector of non-zero constants, try
	  // to avoid loading each constant element as a scalar. Load the constants as a
	  // vector and then insert the variable scalar element. If insertion is not
	  // supported, we assume that we will fall back to a shuffle to get the scalar
	  // blended with the constants. Insertion into a zero vector is handled as a
	  // special-case somewhere below here.
	  LLVMContext &Context = *DAG.getContext();
	  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
	      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
	       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
	    // Create an all-constant vector. The variable element in the old
	    // build vector is replaced by undef in the constant vector. Save the
	    // variable scalar element and its index for use in the insertelement.
	    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
	    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
	    SDValue VarElt;
	    SDValue InsIndex;
	    for (unsigned i = 0; i != NumElems; ++i) {
	      SDValue Elt = Op.getOperand(i);
	      if (auto *C = dyn_cast<ConstantSDNode>(Elt))
	        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
	      else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
	        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
	      else if (!Elt.isUndef()) {
	        assert(!VarElt.getNode() && !InsIndex.getNode() &&
	               "Expected one variable element in this vector");
	        VarElt = Elt;
	        InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
	      }
	    }
	    Constant *CV = ConstantVector::get(ConstVecOps);
	    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

	    // The constants we just created may not be legal (eg, floating point). We
	    // must lower the vector right here because we can not guarantee that we'll
	    // legalize it before loading it. This is also why we could not just create
	    // a new build vector here. If the build vector contains illegal constants,
	    // it could get split back up into a series of insert elements.
	    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
	    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
	    MachineFunction &MF = DAG.getMachineFunction();
	    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
	    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
	    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
	  }

	  // Special case for single non-zero, non-undef, element.
	  if (NumNonZero == 1) {
	    unsigned Idx = countTrailingZeros(NonZeros);
	    SDValue Item = Op.getOperand(Idx);

	    // If this is an insertion of an i64 value on x86-32, and if the top bits of
	    // the value are obviously zero, truncate the value to i32 and do the
	    // insertion that way. Only do this if the value is non-constant or if the
	    // value is a constant being inserted into element 0. It is cheaper to do
	    // a constant pool load than it is to do a movd + shuffle.
	    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
	        (!IsAllConstants || Idx == 0)) {
	      if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
	        // Handle SSE only.
	        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
	        MVT VecVT = MVT::v4i32;

	        // Truncate the value (which may itself be a constant) to i32, and
	        // convert it to a vector with movd (S2V+shuffle to zero extend).
	        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
	        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
	        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
	                                      Item, Idx * 2, true, Subtarget, DAG));
	      }
	    }

	    // If we have a constant or non-constant insertion into the low element of
	    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
	    // the rest of the elements. This will be matched as movd/movq/movss/movsd
	    // depending on what the source datatype is.
	    if (Idx == 0) {
	      if (NumZero == 0)
	        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

	      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
	          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
	        assert((VT.is128BitVector() || VT.is256BitVector() ||
	                VT.is512BitVector()) &&
	               "Expected an SSE value type!");
	        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
	        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	      }

	      // We can't directly insert an i8 or i16 into a vector, so zero extend
	      // it to i32 first.
	      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
	        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
	        if (VT.getSizeInBits() >= 256) {
	          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
	          if (Subtarget.hasAVX()) {
	            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
	            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	          } else {
	            // Without AVX, we need to extend to a 128-bit vector and then
	            // insert into the 256-bit vector.
	            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
	            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
	          }
	        } else {
	          assert(VT.is128BitVector() && "Expected an SSE value type!");
	          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	        }
	        return DAG.getBitcast(VT, Item);
	      }
	    }

	    // Is it a vector logical left shift?
	    if (NumElems == 2 && Idx == 1 &&
	        X86::isZeroNode(Op.getOperand(0)) &&
	        !X86::isZeroNode(Op.getOperand(1))) {
	      unsigned NumBits = VT.getSizeInBits();
	      return getVShift(true, VT,
	                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	                                   VT, Op.getOperand(1)),
	                       NumBits / 2, DAG, *this, dl);
	    }

	    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
	      return SDValue();

	    // Otherwise, if this is a vector with i32 or f32 elements, and the element
	    // is a non-constant being inserted into an element other than the low one,
	    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
	    // movd/movss) to move this into the low element, then shuffle it into
	    // place.
	    if (EVTBits == 32) {
	      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
	    }
	  }

	  // Splat is obviously ok. Let legalizer expand it to a shuffle.
	  if (Values.size() == 1) {
	    if (EVTBits == 32) {
	      // Instead of a shuffle like this:
	      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
	      // Check if it's possible to issue this instead.
	      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
	      unsigned Idx = countTrailingZeros(NonZeros);
	      SDValue Item = Op.getOperand(Idx);
	      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
	        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
	    }
	    return SDValue();
	  }

	  // A vector full of immediates; various special cases are already
	  // handled, so this is best done with a single constant-pool load.
	  if (IsAllConstants)
	    return SDValue();

	  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
	    return V;

	  // See if we can use a vector load to get all of the elements.
	  if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
	    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
	    if (SDValue LD =
	            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
	      return LD;
	  }

	  // For AVX-length vectors, build the individual 128-bit pieces and use
	  // shuffles to put them in place.
	  if (VT.is256BitVector() || VT.is512BitVector()) {
	    EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems / 2);

	    // Build both the lower and upper subvector.
	    SDValue Lower =
	        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
	    SDValue Upper = DAG.getBuildVector(
	        HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));

	    // Recreate the wider vector with the lower and upper part.
	    if (VT.is256BitVector())
	      return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
	    return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
	  }

	  // Let legalizer expand 2-wide build_vectors.
	  if (EVTBits == 64) {
	    if (NumNonZero == 1) {
	      // One half is zero or undef.
	      unsigned Idx = countTrailingZeros(NonZeros);
	      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
	                               Op.getOperand(Idx));
	      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
	    }
	    return SDValue();
	  }

	  // If element VT is < 32 bits, convert it to inserts into a zero vector.
	  if (EVTBits == 8 && NumElems == 16)
	    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
	                                          DAG, Subtarget))
	      return V;

	  if (EVTBits == 16 && NumElems == 8)
	    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
	                                          DAG, Subtarget))
	      return V;

	  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
	  if (EVTBits == 32 && NumElems == 4)
	    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
	      return V;

	  // If element VT is == 32 bits, turn it into a number of shuffles.
	  if (NumElems == 4 && NumZero > 0) {
	    SmallVector<SDValue, 8> Ops(NumElems);
	    for (unsigned i = 0; i < 4; ++i) {
	      bool isZero = !(NonZeros & (1ULL << i));
	      if (isZero)
	        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
	      else
	        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	    }

	    // Combine each adjacent pair (2*i, 2*i+1) into Ops[i], selecting the
	    // operation from the pair's two NonZeros bits.
	    for (unsigned i = 0; i < 2; ++i) {
	      switch ((NonZeros >> (i * 2)) & 0x3) {
	      default: llvm_unreachable("Unexpected NonZero count");
	      case 0:
	        Ops[i] = Ops[i * 2]; // Must be a zero vector.
	        break;
	      case 1:
	        Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2 + 1], Ops[i * 2]);
	        break;
	      case 2:
	        Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
	        break;
	      case 3:
	        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
	        break;
	      }
	    }

	    bool Reverse1 = (NonZeros & 0x3) == 2;
	    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
	    int MaskVec[] = {
	      Reverse1 ? 1 : 0,
	      Reverse1 ? 0 : 1,
	      static_cast<int>(Reverse2 ? NumElems + 1 : NumElems),
	      static_cast<int>(Reverse2 ? NumElems : NumElems + 1)
	    };
	    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
	  }

	  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

	  // Check for a build vector from mostly shuffle plus few inserting.
	  if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
	    return Sh;

	  // For SSE 4.1, use insertps to put the high elements into the low element.
	  if (Subtarget.hasSSE41()) {
	    SDValue Result;
	    if (!Op.getOperand(0).isUndef())
	      Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
	    else
	      Result = DAG.getUNDEF(VT);

	    for (unsigned i = 1; i < NumElems; ++i) {
	      if (Op.getOperand(i).isUndef()) continue;
	      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	                           Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
	    }
	    return Result;
	  }

	  // Otherwise, expand into a number of unpckl*, start by extending each of
	  // our (non-undef) elements to the full vector width with the element in the
	  // bottom slot of the vector (which generates no code for SSE).
	  SmallVector<SDValue, 8> Ops(NumElems);
	  for (unsigned i = 0; i < NumElems; ++i) {
	    if (!Op.getOperand(i).isUndef())
	      Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	    else
	      Ops[i] = DAG.getUNDEF(VT);
	  }

	  // Next, we iteratively mix elements, e.g. for v4f32:
	  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
	  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
	  //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
	  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
	    // Generate scaled UNPCKL shuffle mask.
	    SmallVector<int, 16> Mask;
	    for (unsigned i = 0; i != Scale; ++i)
	      Mask.push_back(i);
	    for (unsigned i = 0; i != Scale; ++i)
	      Mask.push_back(NumElems + i);
	    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

	    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
	      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2 * i], Ops[(2 * i) + 1], Mask);
	  }
	  return Ops[0];
	}

	// 256-bit AVX can use the vinsertf128 instruction
	// to create 256-bit vectors from two other 128-bit ones.
	//
	// Lowers a 256-bit or 512-bit CONCAT_VECTORS. A 256-bit result is the
	// concatenation of its first two operands. A 512-bit result is built either
	// from four operands (paired into two halves first) or directly from two.
	static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();

	  assert((ResVT.is256BitVector() ||
	          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  unsigned NumElems = ResVT.getVectorNumElements();
	  if (ResVT.is256BitVector())
	    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

	  if (Op.getNumOperands() == 4) {
	    // Four operands: concatenate them pairwise into two half-width vectors,
	    // then join the halves.
	    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	                                  NumElems / 2);
	    SDValue V3 = Op.getOperand(2);
	    SDValue V4 = Op.getOperand(3);
	    return concat256BitVectors(
	        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
	        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
	        NumElems, DAG, dl);
	  }
	  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
	}

	// Return true if all the operands of the given CONCAT_VECTORS node are zeros
	// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
	static bool isExpandWithZeros(const SDValue &Op) {
	assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
	"Expand with zeros only possible in CONCAT_VECTORS nodes!");

	for (unsigned i = 1; i < Op.getNumOperands(); i++)
	if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
	return false;

	return true;
	}

	// Checks whether the given node is a type promotion (by concatenating i1
	// zeros) of the result of a node that already zeros all upper bits of
	// k-register.
	//
	// Despite the "is" prefix this does not return a bool: on a match it returns
	// the underlying node found beneath the chain of zero-extending
	// CONCAT_VECTORS / INSERT_SUBVECTOR nodes, otherwise an empty SDValue.
	static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
	  unsigned Opc = Op.getOpcode();

	  assert(Opc == ISD::CONCAT_VECTORS &&
	         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Unexpected node to check for type promotion!");

	  // As long as we are concatenating zeros to the upper part of a previous node
	  // result, climb up the tree until a node with different opcode is
	  // encountered
	  while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
	    if (Opc == ISD::INSERT_SUBVECTOR) {
	      // Only an insertion at index 0 into an all-zeros vector qualifies.
	      if (!ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) ||
	          Op.getConstantOperandVal(2) != 0)
	        return SDValue();
	      Op = Op.getOperand(1);
	    } else { // Opc == ISD::CONCAT_VECTORS
	      if (!isExpandWithZeros(Op))
	        return SDValue();
	      Op = Op.getOperand(0);
	    }
	    Opc = Op.getOpcode();
	  }

	  // Check if the first inserted node zeroes the upper bits, or an 'and' result
	  // of a node that zeros the upper bits (its masked version).
	  if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
	      (Op.getOpcode() == ISD::AND &&
	       (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
	        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
	    return Op;
	  }

	  return SDValue();
	}

	// Lower a CONCAT_VECTORS node whose result is a vector of i1 elements.
	// The operand count must be a power of two greater than one.
	static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();
	  unsigned NumOperands = Op.getNumOperands();

	  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
	         "Unexpected number of operands in CONCAT_VECTORS");

	  // If this node promotes - by concatenating zeroes - the type of the result
	  // of a node with instruction that zeroes all upper (irrelevant) bits of the
	  // output register, mark it as legal and catch the pattern in instruction
	  // selection to avoid emitting extra instructions (for zeroing upper bits).
	  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
	    SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
	    SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
	                       ZeroC);
	  }

	  // Classify the operands. Undef subvectors are skipped; NonZeros has bit i
	  // set when operand i is neither undef nor an all-zeros build_vector.
	  unsigned NumZero = 0;
	  unsigned NumNonZero = 0;
	  uint64_t NonZeros = 0;
	  for (unsigned i = 0; i != NumOperands; ++i) {
	    SDValue SubVec = Op.getOperand(i);
	    if (SubVec.isUndef())
	      continue;
	    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	      ++NumZero;
	    else {
	      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
	      NonZeros |= (uint64_t)1 << i;
	      ++NumNonZero;
	    }
	  }

	  // If there are zero or one non-zeros we can handle this very simply.
	  if (NumNonZero <= 1) {
	    // Start from zeros if any operand was explicitly zero, else from undef.
	    SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
	                          : DAG.getUNDEF(ResVT);
	    if (!NumNonZero)
	      return Vec;
	    unsigned Idx = countTrailingZeros(NonZeros);
	    SDValue SubVec = Op.getOperand(Idx);
	    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
	                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
	  }

	  // More than two operands: concatenate each half of the operand list
	  // separately, then concatenate the two halves.
	  if (NumOperands > 2) {
	    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	                                  ResVT.getVectorNumElements() / 2);
	    ArrayRef<SDUse> Ops = Op->ops();
	    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(0, NumOperands / 2));
	    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(NumOperands / 2));
	    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
	  }

	  assert(NumNonZero == 2 && "Simple cases not handled?");

	  if (ResVT.getVectorNumElements() >= 16)
	    return Op; // The operation is legal with KUNPCK

	  // Otherwise insert the two operands into the lower and upper half of an
	  // undef vector.
	  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
	                            DAG.getUNDEF(ResVT), Op.getOperand(0),
	                            DAG.getIntPtrConstant(0, dl));
	  unsigned NumElems = ResVT.getVectorNumElements();
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
	                     DAG.getIntPtrConstant(NumElems / 2, dl));
	}

	// Lower a CONCAT_VECTORS node. Vectors of i1 are handled by
	// LowerCONCAT_VECTORSvXi1; every other type must be a 256-bit vector with two
	// operands or a 512-bit vector with two or four operands.
	static SDValue LowerCONCAT_VECTORS(SDValue Op,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT.getVectorElementType() == MVT::i1)
	    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

	  assert(((VT.is256BitVector() && Op.getNumOperands() == 2) ||
	          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
	                                   Op.getNumOperands() == 4))) &&
	         "Unexpected type or operand count in CONCAT_VECTORS");

	  // AVX can use the vinsertf128 instruction to create 256-bit vectors
	  // from two other 128-bit ones.

	  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
	  return LowerAVXCONCAT_VECTORS(Op, DAG);
	}

	//===----------------------------------------------------------------------===//
	// Vector shuffle lowering
	//
	// This is an experimental code path for lowering vector shuffles on x86. It is
	// designed to handle arbitrary vector shuffles and blends, gracefully
	// degrading performance as necessary. It works hard to recognize idiomatic
	// shuffles and lower them to optimal instruction patterns without leaving
	// a framework that allows reasonably efficient handling of all vector shuffle
	// patterns.
	//===----------------------------------------------------------------------===//

	/// Tiny helper that recognises a shuffle mask which moves nothing.
	///
	/// The mask is assumed to be a single-input mask of the kind used by the X86
	/// shuffle instructions (not a fully general ShuffleVectorSDNode mask). An
	/// element counts as a no-op when it is undef (negative) or when it selects
	/// its own position; the mask is a no-op only if every element is.
	static bool isNoopShuffleMask(ArrayRef<int> Mask) {
	  int Position = 0;
	  for (int Elt : Mask) {
	    assert(Elt >= -1 && "Out of bound mask element!");
	    // A defined element that reads from another slot is a real shuffle.
	    if (Elt >= 0 && Elt != Position)
	      return false;
	    ++Position;
	  }
	  return true;
	}

	/// Report whether any defined element of the shuffle mask reads from a
	/// different 128-bit lane than the one it is written to.
	///
	/// X86 splits its shuffles into in-lane and cross-lane operations, so this
	/// predicate is queried routinely. Undef (negative) elements never count as
	/// crossing, and indices into the second input are reduced modulo the mask
	/// size before their lane is computed.
	static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
	  const int NumElts = Mask.size();
	  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
	  for (int Idx = 0; Idx < NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;
	    const int SrcLane = (M % NumElts) / EltsPerLane;
	    const int DstLane = Idx / EltsPerLane;
	    if (SrcLane != DstLane)
	      return true;
	  }
	  return false;
	}

	/// Test whether a shuffle mask is equivalent within each sub-lane.
	///
	/// This checks a shuffle mask to see if it is performing the same
	/// lane-relative shuffle in each sub-lane of \p LaneSizeInBits bits. This
	/// trivially implies that it is also not lane-crossing. It may however
	/// involve a blend from the same lane of a second vector.
	///
	/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it
	/// is non-trivial to compute in the face of undef lanes. Entries from the
	/// second vector are remapped to [LaneSize, 2*LaneSize), which makes the
	/// result suitable for use with existing single-lane shuffles. Returns false
	/// (leaving \p RepeatedMask partially filled) when an element crosses lanes
	/// or when two lanes disagree on a slot.
	static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                  ArrayRef<int> Mask,
	                                  SmallVectorImpl<int> &RepeatedMask) {
	  // Signed on purpose: it is mixed with signed mask values below, and this
	  // matches isRepeatedTargetShuffleMask.
	  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    int M = Mask[i];
	    assert(M == SM_SentinelUndef || M >= 0);
	    if (M < 0)
	      continue;
	    if ((M % Size) / LaneSize != i / LaneSize)
	      // This entry crosses lanes, so there is no way to model this shuffle.
	      return false;

	    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
	    // Adjust second vector indices to start at LaneSize instead of Size.
	    int LocalM = M < Size ? M % LaneSize : M % LaneSize + LaneSize;
	    int &Slot = RepeatedMask[i % LaneSize];
	    if (Slot < 0)
	      // This is the first non-undef entry in this slot of a lane.
	      Slot = LocalM;
	    else if (Slot != LocalM)
	      // Found a mismatch with the repeated mask.
	      return false;
	  }
	  return true;
	}

	/// Convenience wrapper around isRepeatedShuffleMask for 128-bit lanes: true
	/// when every 128-bit lane of \p Mask performs the same lane-relative
	/// shuffle, with that shuffle returned in \p RepeatedMask.
	static bool
	is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneBits = 128;
	  return isRepeatedShuffleMask(LaneBits, VT, Mask, RepeatedMask);
	}

	/// Convenience wrapper around isRepeatedShuffleMask for 256-bit lanes: true
	/// when every 256-bit lane of \p Mask performs the same lane-relative
	/// shuffle, with that shuffle returned in \p RepeatedMask.
	static bool
	is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneBits = 256;
	  return isRepeatedShuffleMask(LaneBits, VT, Mask, RepeatedMask);
	}

	/// Test whether a target shuffle mask is equivalent within each sub-lane.
	/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
	///
	/// \p RepeatedMask receives one entry per element of a lane. A slot becomes
	/// SM_SentinelZero only if every lane has undef or zero there; a slot that
	/// already holds a real index cannot also be zero (and vice versa). Indices
	/// into the second input are remapped to [LaneSize, 2*LaneSize). Returns
	/// false when an element crosses lanes or when lanes disagree on a slot.
	static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                        ArrayRef<int> Mask,
	                                        SmallVectorImpl<int> &RepeatedMask) {
	  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    int M = Mask[i];
	    assert(isUndefOrZero(M) || (M >= 0));
	    if (M == SM_SentinelUndef)
	      continue;

	    int &Slot = RepeatedMask[i % LaneSize];
	    if (M == SM_SentinelZero) {
	      // A zero is only compatible with a slot that is still undef or zero.
	      if (!isUndefOrZero(Slot))
	        return false;
	      Slot = SM_SentinelZero;
	      continue;
	    }
	    if ((M % Size) / LaneSize != i / LaneSize)
	      // This entry crosses lanes, so there is no way to model this shuffle.
	      return false;

	    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
	    // Adjust second vector indices to start at LaneSize instead of Size.
	    int LocalM = M < Size ? M % LaneSize : M % LaneSize + LaneSize;
	    if (Slot == SM_SentinelUndef)
	      // This is the first non-undef entry in this slot of a lane.
	      Slot = LocalM;
	    else if (Slot != LocalM)
	      // Found a mismatch with the repeated mask.
	      return false;
	  }
	  return true;
	}

	/// Checks whether a shuffle mask is equivalent to an explicit expected mask.
	///
	/// This is a fast way to test a shuffle mask against a fixed pattern:
	///
	///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
	///
	/// It returns true if the mask is exactly as wide as the expected mask, and
	/// each element of the mask is either -1 (signifying undef), the value given
	/// in the expected mask, or - when the inputs involved are BUILD_VECTOR
	/// nodes - an index that names the same operand as the expected index does.
	static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
	                                ArrayRef<int> ExpectedMask) {
	  if (Mask.size() != ExpectedMask.size())
	    return false;

	  int Size = Mask.size();

	  // If the values are build vectors, we can look through them to find
	  // equivalent inputs that make the shuffles equivalent.
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
	  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

	  for (int i = 0; i < Size; ++i) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    if (Mask[i] < 0 || Mask[i] == ExpectedMask[i])
	      continue;

	    // The indices differ; they may still select the same scalar if both
	    // refer into build vectors.
	    auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
	    auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
	    if (!MaskBV || !ExpectedBV ||
	        MaskBV->getOperand(Mask[i] % Size) !=
	            ExpectedBV->getOperand(ExpectedMask[i] % Size))
	      return false;
	  }

	  return true;
	}

	/// Compare a target shuffle mask against an explicit pattern.
	///
	/// Both masks must have the same number of elements. An SM_SentinelUndef
	/// (-1) element in \p Mask accepts whatever \p ExpectedMask holds at that
	/// position. Every other element must equal the expected one exactly;
	/// SM_SentinelZero is the only other negative value tolerated in \p Mask and
	/// it, too, must be matched by the same value in \p ExpectedMask.
	static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
	                                      ArrayRef<int> ExpectedMask) {
	  const int NumElts = Mask.size();
	  if ((int)ExpectedMask.size() != NumElts)
	    return false;

	  for (int Idx = 0; Idx < NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M == SM_SentinelUndef)
	      continue;
	    const bool IsKnownValue = (M >= 0) || (M == SM_SentinelZero);
	    if (!IsKnownValue || M != ExpectedMask[Idx])
	      return false;
	  }

	  return true;
	}

	// Combine a general DAG shuffle mask with a per-element zeroable bit mask
	// into a target shuffle mask: undef elements stay SM_SentinelUndef, elements
	// flagged in Zeroable become SM_SentinelZero, and all remaining elements keep
	// their original index.
	static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
	                                                    const APInt &Zeroable) {
	  const int NumElts = Mask.size();
	  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

	  SmallVector<int, 64> TargetMask;
	  TargetMask.reserve(NumElts);
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M == SM_SentinelUndef) {
	      TargetMask.push_back(SM_SentinelUndef);
	      continue;
	    }
	    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
	    if (Zeroable[Idx])
	      TargetMask.push_back(SM_SentinelZero);
	    else
	      TargetMask.push_back(M);
	  }
	  return TargetMask;
	}

	// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
	// instructions. Only v8i32/v8f32 are considered; the mask is compared against
	// the binary v8i16 unpack-lo and unpack-hi patterns.
	static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
	  if (VT != MVT::v8i32 && VT != MVT::v8f32)
	    return false;

	  SmallVector<int, 8> Unpcklwd;
	  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
	                          /* Unary = */ false);
	  SmallVector<int, 8> Unpckhwd;
	  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
	                          /* Unary = */ false);
	  return isTargetShuffleEquivalent(Mask, Unpcklwd) ||
	         isTargetShuffleEquivalent(Mask, Unpckhwd);
	}

	/// Get a 4-lane 8-bit shuffle immediate for a mask.
	///
	/// This helper function produces an 8-bit shuffle immediate corresponding to
	/// the ubiquitous shuffle encoding scheme used in x86 instructions for
	/// shuffling 4 lanes: lane i of the result is selected by bits [2i+1:2i]. It
	/// can be used with most of the PSHUF instructions for example.
	///
	/// NB: We rely heavily on "undef" masks preserving the input lane, so an
	/// undef element i is encoded as selecting lane i.
	static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
	  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");

	  unsigned Imm = 0;
	  for (int Lane = 0; Lane != 4; ++Lane) {
	    int M = Mask[Lane];
	    assert(M >= -1 && M < 4 && "Out of bound mask element!");
	    // Two selector bits per lane; undef keeps the lane in place.
	    Imm |= (M < 0 ? Lane : M) << (2 * Lane);
	  }
	  return Imm;
	}

	/// Wrap the immediate computed by getV4X86ShuffleImm for \p Mask in an i8
	/// constant node.
	static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
	                                          SelectionDAG &DAG) {
	  const unsigned Imm = getV4X86ShuffleImm(Mask);
	  return DAG.getConstant(Imm, DL, MVT::i8);
	}

	/// Compute whether each element of a shuffle is zeroable.
	///
	/// A "zeroable" vector shuffle element is one which can be lowered to zero.
	/// Either it is an undef element in the shuffle mask, the element of the input
	/// referenced is undef, or the element of the input referenced is known to be
	/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
	/// as many lanes with this technique as possible to simplify the remaining
	/// shuffle.
	///
	/// Bit i of the returned APInt (which has Mask.size() bits) is set when mask
	/// element i is zeroable. Both inputs are inspected through bitcasts, and
	/// beyond all-zero inputs only BUILD_VECTOR operands are examined.
	static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
	                                            SDValue V1, SDValue V2) {
	  APInt Zeroable(Mask.size(), 0);
	  V1 = peekThroughBitcasts(V1);
	  V2 = peekThroughBitcasts(V2);

	  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
	  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

	  // Size in bits of one shuffle element, derived from the mask width because
	  // the bitcast-stripped inputs may have a different element type.
	  int VectorSizeInBits = V1.getValueSizeInBits();
	  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
	  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

	  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	    int M = Mask[i];
	    // Handle the easy cases.
	    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
	      Zeroable.setBit(i);
	      continue;
	    }

	    // Determine shuffle input and normalize the mask.
	    SDValue V = M < Size ? V1 : V2;
	    M %= Size;

	    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
	    if (V.getOpcode() != ISD::BUILD_VECTOR)
	      continue;

	    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
	    // the (larger) source element must be UNDEF/ZERO.
	    if ((Size % V.getNumOperands()) == 0) {
	      int Scale = Size / V->getNumOperands();
	      SDValue Op = V.getOperand(M / Scale);
	      if (Op.isUndef() || X86::isZeroNode(Op))
	        Zeroable.setBit(i);
	      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
	        // Extract the slice of the wider constant covered by this element.
	        APInt Val = Cst->getAPIntValue();
	        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
	        Val = Val.getLoBits(ScalarSizeInBits);
	        if (Val == 0)
	          Zeroable.setBit(i);
	      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
	        // Same slice test on the bit pattern of a floating point constant.
	        APInt Val = Cst->getValueAPF().bitcastToAPInt();
	        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
	        Val = Val.getLoBits(ScalarSizeInBits);
	        if (Val == 0)
	          Zeroable.setBit(i);
	      }
	      continue;
	    }

	    // If the BUILD_VECTOR has more elements then all the (smaller) source
	    // elements must be UNDEF or ZERO.
	    if ((V.getNumOperands() % Size) == 0) {
	      int Scale = V->getNumOperands() / Size;
	      bool AllZeroable = true;
	      for (int j = 0; j < Scale; ++j) {
	        SDValue Op = V.getOperand((M * Scale) + j);
	        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
	      }
	      if (AllZeroable)
	        Zeroable.setBit(i);
	      continue;
	    }
	  }

	  return Zeroable;
	}

	// The shuffle result is expected to look like:
	//   0* a[0] 0* a[1] ... 0* a[n], n >= 0, with the a[] indices consecutive and
	//   ascending.
	// Each bit of Zeroable corresponds to the mask element at the same position,
	// as described for computeZeroableShuffleElements.
	//
	// Returns true when the mask has no undef elements and its non-zeroable
	// elements form a run of consecutive indices that starts either at 0 or at
	// the element count of VectorType. IsZeroSideLeft is written when the first
	// non-zeroable element is seen: true if that element's index is non-zero
	// (the run is then required to start at the element count), false if it is
	// 0. It is left untouched when every element is zeroable.
	static bool isNonZeroElementsInOrder(const APInt &Zeroable,
	                                     ArrayRef<int> Mask, const EVT &VectorType,
	                                     bool &IsZeroSideLeft) {
	  // Index the next non-zeroable element must have; negative until the first
	  // such element has been seen.
	  int Expected = -1;
	  const int NumMaskElts = Mask.size();
	  for (int Idx = 0; Idx < NumMaskElts; ++Idx) {
	    const int M = Mask[Idx];
	    assert(M >= -1 && "Out of bound mask element!");
	    // Undef elements are not accepted here.
	    if (M < 0)
	      return false;
	    if (Zeroable[Idx])
	      continue;

	    // The first non-zeroable element fixes where the run starts.
	    if (Expected < 0) {
	      if (M == 0)
	        Expected = 0;
	      else
	        Expected = VectorType.getVectorNumElements();
	      IsZeroSideLeft = (Expected != 0);
	    }

	    // Every non-zeroable element must continue the ascending run.
	    if (M != Expected)
	      return false;
	    ++Expected;
	  }
	  return true;
	}

	/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
	///
	/// A per-byte control vector is built from \p Mask: undef elements become
	/// undef bytes, zeroable elements become 0x80 (sign bit set, which PSHUFB
	/// treats as "write zero"), and every other element expands to the byte
	/// offsets of its source element within the 128-bit lane. Returns an empty
	/// SDValue when the non-zeroable elements come from both inputs or when any
	/// of them would have to cross a 128-bit lane.
	static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
	                                            ArrayRef<int> Mask, SDValue V1,
	                                            SDValue V2,
	                                            const APInt &Zeroable,
	                                            const X86Subtarget &Subtarget,
	                                            SelectionDAG &DAG) {
	  int Size = Mask.size();
	  int LaneSize = 128 / VT.getScalarSizeInBits();
	  const int NumBytes = VT.getSizeInBits() / 8;
	  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

	  assert(((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
	          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
	          (Subtarget.hasBWI() && VT.is512BitVector())) &&
	         "PSHUFB is not available for this vector width");

	  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
	  // Sign bit set in i8 mask means zero element.
	  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

	  SDValue V;
	  for (int i = 0; i < NumBytes; ++i) {
	    int M = Mask[i / NumEltBytes];
	    if (M < 0) {
	      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
	      continue;
	    }
	    if (Zeroable[i / NumEltBytes]) {
	      PSHUFBMask[i] = ZeroMask;
	      continue;
	    }

	    // We can only use a single input of V1 or V2.
	    SDValue SrcV = (M >= Size ? V2 : V1);
	    if (V && V != SrcV)
	      return SDValue();
	    V = SrcV;
	    M %= Size;

	    // PSHUFB can't cross lanes, ensure this doesn't happen.
	    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
	      return SDValue();

	    // Convert the lane-relative element index into a lane-relative byte
	    // index for this byte of the element.
	    M = M % LaneSize;
	    M = M * NumEltBytes + (i % NumEltBytes);
	    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
	  }
	  assert(V && "Failed to find a source input");

	  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
	  return DAG.getBitcast(
	      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
	                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
	}

	// Forward declaration: lowerVectorShuffleToEXPAND below calls getMaskNode,
	// whose definition is not part of this section of the file.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	const SDLoc &dl);

	// Try to lower a shuffle as X86ISD::EXPAND.
	//
	// The shuffle qualifies when isNonZeroElementsInOrder accepts it. The
	// result is then a select between the expanded input and a zero vector,
	// controlled by the complement of Zeroable. V2 is expanded when the helper
	// reports IsZeroSideLeft, V1 otherwise. Returns an empty SDValue when the
	// pattern does not match.
	static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
	                                          const APInt &Zeroable,
	                                          ArrayRef<int> Mask, SDValue &V1,
	                                          SDValue &V2, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget) {
	  bool IsLeftZeroSide = true;
	  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
	                                IsLeftZeroSide))
	    return SDValue();

	  unsigned NumElts = VT.getVectorNumElements();
	  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
	         "Unexpected number of vector elements");

	  // One mask bit per element, set where the element is NOT zeroable. The
	  // integer holding it is at least 8 bits wide.
	  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
	  MVT IntegerType = MVT::getIntegerVT(std::max((int)NumElts, 8));
	  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
	  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
	                              Subtarget, DAG, DL);
	  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
	  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
	  return DAG.getSelect(DL, VT, VMask,
	                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
	                       ZeroVector);
	}

	/// Try to match a target shuffle mask against UNPCKL/UNPCKH.
	///
	/// On success returns true, stores X86ISD::UNPCKL or X86ISD::UNPCKH in
	/// \p UnpackOpcode and rewrites \p V1 / \p V2 to the operands the unpack
	/// should use: undef when all elements taken from that side are undef, a
	/// zero vector for the unary "unpack with zero" form, or swapped inputs when
	/// only the commuted binary pattern matches. Returns false, leaving the
	/// inputs untouched, when no unpack pattern applies.
	static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
	                                        unsigned &UnpackOpcode, bool IsUnary,
	                                        ArrayRef<int> TargetMask, SDLoc &DL,
	                                        SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  int NumElts = VT.getVectorNumElements();

	  // Classify the even (first operand) and odd (second operand) result
	  // elements: entirely undef, or entirely undef-or-zero.
	  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
	  for (int i = 0; i != NumElts; i += 2) {
	    int M1 = TargetMask[i + 0];
	    int M2 = TargetMask[i + 1];
	    Undef1 &= (SM_SentinelUndef == M1);
	    Undef2 &= (SM_SentinelUndef == M2);
	    Zero1 &= isUndefOrZero(M1);
	    Zero2 &= isUndefOrZero(M2);
	  }
	  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
	         "Zeroable shuffle detected");

	  // Attempt to match the target mask against the unpack lo/hi mask patterns.
	  SmallVector<int, 64> Unpckl, Unpckh;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
	  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	    UnpackOpcode = X86ISD::UNPCKL;
	    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	    return true;
	  }

	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
	  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	    UnpackOpcode = X86ISD::UNPCKH;
	    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	    return true;
	  }

	  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
	  if (IsUnary && (Zero1 || Zero2)) {
	    // Don't bother if we can blend instead.
	    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
	        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
	      return false;

	    bool MatchLo = true, MatchHi = true;
	    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
	      int M = TargetMask[i];

	      // Ignore if the input is known to be zero or the index is undef.
	      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
	          (M == SM_SentinelUndef))
	        continue;

	      MatchLo &= (M == Unpckl[i]);
	      MatchHi &= (M == Unpckh[i]);
	    }

	    if (MatchLo || MatchHi) {
	      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
	      // V2 must be assigned first: it may need the original V1.
	      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	      return true;
	    }
	  }

	  // If a binary shuffle, commute and try again.
	  if (!IsUnary) {
	    ShuffleVectorSDNode::commuteMask(Unpckl);
	    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	      UnpackOpcode = X86ISD::UNPCKL;
	      std::swap(V1, V2);
	      return true;
	    }

	    ShuffleVectorSDNode::commuteMask(Unpckh);
	    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	      UnpackOpcode = X86ISD::UNPCKH;
	      std::swap(V1, V2);
	      return true;
	    }
	  }

	  return false;
	}

	// X86 has dedicated unpack instructions that can handle specific blend
	// operations: UNPCKH and UNPCKL.
	//
	// The mask is compared against the binary unpack-lo and unpack-hi patterns,
	// first as given and then with the two inputs commuted. Returns the matching
	// unpack node, or an empty SDValue if none of the four patterns match.
	static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
	                                           ArrayRef<int> Mask, SDValue V1,
	                                           SDValue V2, SelectionDAG &DAG) {
	  SmallVector<int, 8> Unpckl;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

	  SmallVector<int, 8> Unpckh;
	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

	  // Commute and try again.
	  ShuffleVectorSDNode::commuteMask(Unpckl);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

	  ShuffleVectorSDNode::commuteMask(Unpckh);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

	  return SDValue();
	}

	// X86 has dedicated pack instructions that can handle specific truncation
	// operations: PACKSS and PACKUS.
	//
	// The target mask is compared against the binary and then the unary pack
	// pattern. A pattern only counts if the inputs, viewed as vectors of
	// double-width integers, are safe to pack:
	//  - PACKSS when every non-undef input has more than BitSize sign bits;
	//  - PACKUS (needs SSE4.1 unless packing from i16) when the upper BitSize
	//    bits of every non-undef input element are known zero.
	// On success V1/V2 are replaced by the inputs bitcast to the double-width
	// type, SrcVT receives that type and PackOpcode the chosen opcode.
	static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
	                                       SDValue &V2, unsigned &PackOpcode,
	                                       ArrayRef<int> TargetMask,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned BitSize = VT.getScalarSizeInBits();
	  MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
	  MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

	  auto MatchPACK = [&](SDValue N1, SDValue N2) {
	    SDValue VV1 = DAG.getBitcast(PackVT, N1);
	    SDValue VV2 = DAG.getBitcast(PackVT, N2);
	    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
	        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
	      V1 = VV1;
	      V2 = VV2;
	      SrcVT = PackVT;
	      PackOpcode = X86ISD::PACKSS;
	      return true;
	    }

	    if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
	      APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
	      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
	          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
	        V1 = VV1;
	        V2 = VV2;
	        SrcVT = PackVT;
	        PackOpcode = X86ISD::PACKUS;
	        return true;
	      }
	    }

	    return false;
	  };

	  // Try binary shuffle.
	  SmallVector<int, 32> BinaryMask;
	  createPackShuffleMask(VT, BinaryMask, false);
	  if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
	    if (MatchPACK(V1, V2))
	      return true;

	  // Try unary shuffle.
	  SmallVector<int, 32> UnaryMask;
	  createPackShuffleMask(VT, UnaryMask, true);
	  if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
	    if (MatchPACK(V1, V1))
	      return true;

	  return false;
	}

	/// Try to lower a shuffle as a single PACKSS/PACKUS node.
	///
	/// Matching is delegated to matchVectorShuffleWithPACK, which picks the
	/// opcode and the source type and may replace the (by-value) inputs. Returns
	/// an empty SDValue when the matcher rejects the mask.
	static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
	                                          ArrayRef<int> Mask, SDValue V1,
	                                          SDValue V2, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget) {
	  MVT SourceVT;
	  unsigned Opcode;
	  const bool Matched = matchVectorShuffleWithPACK(VT, SourceVT, V1, V2, Opcode,
	                                                  Mask, DAG, Subtarget);
	  if (!Matched)
	    return SDValue();

	  return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SourceVT, V1),
	                     DAG.getBitcast(SourceVT, V2));
	}

	/// Try to emit a bitmask (AND with a constant vector) for a shuffle.
	///
	/// This covers blends that can be modelled exactly as a bitmask because
	/// every element is either zeroable or passes through, in place, from one
	/// single input. Returns an empty SDValue when an element would have to
	/// move, when both inputs contribute, or when every element is zeroable.
	static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           SelectionDAG &DAG) {
	  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
	  const MVT ScalarVT = VT.getVectorElementType();
	  SDValue ZeroElt = DAG.getConstant(0, DL, ScalarVT);
	  SDValue OnesElt = DAG.getAllOnesConstant(DL, ScalarVT);

	  // Start from an all-zero mask and open up the lanes that pass through.
	  SmallVector<SDValue, 16> LaneMasks(Mask.size(), ZeroElt);
	  SDValue Passthru;
	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx < NumElts; ++Idx) {
	    if (Zeroable[Idx])
	      continue;

	    const int M = Mask[Idx];
	    if (M % NumElts != Idx)
	      return SDValue(); // Not a blend.

	    SDValue Source = M < NumElts ? V1 : V2;
	    if (Passthru && Passthru != Source)
	      return SDValue(); // Can only let one input through the mask.
	    Passthru = Source;

	    LaneMasks[Idx] = OnesElt;
	  }

	  if (!Passthru)
	    return SDValue(); // No non-zeroable elements!

	  SDValue MaskVec = DAG.getBuildVector(VT, DL, LaneMasks);
	  return DAG.getNode(ISD::AND, DL, VT, Passthru, MaskVec);
	}

	/// Try to emit a blend for a shuffle using plain bit math:
	/// (V1 & M) | (~M & V2).
	///
	/// This is used as a fallback approach when first class blend instructions
	/// are unavailable. Currently it is only suitable for integer vectors, but
	/// could be generalized for floating point vectors if desirable. Returns an
	/// empty SDValue if any defined element is taken from a different position
	/// than its own.
	static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                            SDValue V2, ArrayRef<int> Mask,
	                                            SelectionDAG &DAG) {
	  assert(VT.isInteger() && "Only supports integer vector types!");
	  const MVT ScalarVT = VT.getVectorElementType();
	  SDValue ZeroElt = DAG.getConstant(0, DL, ScalarVT);
	  SDValue OnesElt = DAG.getAllOnesConstant(DL, ScalarVT);

	  // Per element: all-ones selects V1 (also used for undef), zero selects V2.
	  SmallVector<SDValue, 16> SelectV1;
	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx < NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    const bool InPlace = (M == Idx) || (M == Idx + NumElts);
	    if (M >= 0 && !InPlace)
	      return SDValue(); // Shuffled input!
	    if (M < NumElts)
	      SelectV1.push_back(OnesElt);
	    else
	      SelectV1.push_back(ZeroElt);
	  }

	  SDValue V1Mask = DAG.getBuildVector(VT, DL, SelectV1);
	  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
	  // We have to cast V2 around.
	  const MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
	  SDValue MaskedV2 = DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
	                                 DAG.getBitcast(MaskVT, V1Mask),
	                                 DAG.getBitcast(MaskVT, V2));
	  V2 = DAG.getBitcast(VT, MaskedV2);
	  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
	}

	// Forward declaration: lowerVectorShuffleAsBlend below calls
	// getVectorMaskingNode, whose definition is not part of this section of the
	// file.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	SDValue PreservedSrc,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG);

	/// Check whether a target shuffle mask is a per-element blend of V1 and V2.
	///
	/// On success \p BlendMask has bit i set for every element taken from V2.
	/// An SM_SentinelZero element can be satisfied by an input that is undef or
	/// an all-zeros build vector: the element in \p TargetMask is rewritten to
	/// refer to that input in place and \p ForceV1Zero / \p ForceV2Zero tells the
	/// caller that the corresponding input must really be zero. Returns false
	/// (with the outputs in an unspecified, partially updated state) as soon as
	/// an element is neither undef, in place, nor a satisfiable zero.
	static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
	                                      MutableArrayRef<int> TargetMask,
	                                      bool &ForceV1Zero, bool &ForceV2Zero,
	                                      uint64_t &BlendMask) {
	  bool V1IsZeroOrUndef =
	      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
	  bool V2IsZeroOrUndef =
	      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

	  BlendMask = 0;
	  ForceV1Zero = false;
	  ForceV2Zero = false;
	  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

	  // Attempt to generate the binary blend mask. If an input is zero then
	  // we can use any lane.
	  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
	  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
	    int M = TargetMask[i];
	    if (M == SM_SentinelUndef)
	      continue;
	    if (M == i)
	      continue;
	    if (M == i + Size) {
	      BlendMask |= 1ull << i;
	      continue;
	    }
	    if (M == SM_SentinelZero) {
	      if (V1IsZeroOrUndef) {
	        ForceV1Zero = true;
	        TargetMask[i] = i;
	        continue;
	      }
	      if (V2IsZeroOrUndef) {
	        ForceV2Zero = true;
	        BlendMask |= 1ull << i;
	        TargetMask[i] = i + Size;
	        continue;
	      }
	    }
	    return false;
	  }
	  return true;
	}

	/// Widen a blend bit mask so that each of the \p Size original bits covers
	/// \p Scale consecutive bits: bit i of \p BlendMask, when set, becomes bits
	/// [i*Scale, (i+1)*Scale) of the result.
	static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
	                                            int Scale) {
	  assert(Scale > 0 && Size >= 0 && Size * Scale <= 64 &&
	         "Scaled blend mask must fit in 64 bits");
	  // Run of Scale set bits; a shift by 64 would be undefined, so handle it.
	  const uint64_t LaneBits = Scale >= 64 ? ~0ull : ((1ull << Scale) - 1);
	  uint64_t ScaledMask = 0;
	  for (int i = 0; i != Size; ++i)
	    if (BlendMask & (1ull << i))
	      ScaledMask |= LaneBits << (i * Scale);
	  return ScaledMask;
	}

	/// Try to emit a blend instruction for a shuffle.
	///
	/// This doesn't do any checks for the availability of instructions for blending
	/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
	/// be matched in the backend with the type given. What it does check for is
	/// that the shuffle mask is a blend, or convertible into a blend with zero.
	///
	/// Returns an empty SDValue when matchVectorShuffleAsBlend rejects the mask.
	/// Otherwise the lowering is picked by \p VT: an immediate blend (possibly on
	/// a bitcast type with a rescaled mask), a byte-wise select, an AND with a
	/// constant mask, or a mask-register select via getVectorMaskingNode.
	static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                         SDValue V2, ArrayRef<int> Original,
	                                         const APInt &Zeroable,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

	  uint64_t BlendMask = 0;
	  bool ForceV1Zero = false, ForceV2Zero = false;
	  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
	                                 BlendMask))
	    return SDValue();

	  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
	  if (ForceV1Zero)
	    V1 = getZeroVector(VT, Subtarget, DAG, DL);
	  if (ForceV2Zero)
	    V2 = getZeroVector(VT, Subtarget, DAG, DL);

	  switch (VT.SimpleTy) {
	  case MVT::v2f64:
	  case MVT::v4f32:
	  case MVT::v4f64:
	  case MVT::v8f32:
	    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
	                       DAG.getConstant(BlendMask, DL, MVT::i8));

	  case MVT::v4i64:
	  case MVT::v8i32:
	    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	    LLVM_FALLTHROUGH;
	  case MVT::v2i64:
	  case MVT::v4i32:
	    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
	    // that instruction.
	    if (Subtarget.hasAVX2()) {
	      // Scale the blend by the number of 32-bit dwords per element.
	      int Scale = VT.getScalarSizeInBits() / 32;
	      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
	      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
	      V1 = DAG.getBitcast(BlendVT, V1);
	      V2 = DAG.getBitcast(BlendVT, V2);
	      return DAG.getBitcast(
	          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
	                          DAG.getConstant(BlendMask, DL, MVT::i8)));
	    }
	    LLVM_FALLTHROUGH;
	  case MVT::v8i16: {
	    // For integer shuffles we need to expand the mask and cast the inputs to
	    // v8i16s prior to blending.
	    int Scale = 8 / VT.getVectorNumElements();
	    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
	    V1 = DAG.getBitcast(MVT::v8i16, V1);
	    V2 = DAG.getBitcast(MVT::v8i16, V2);
	    return DAG.getBitcast(VT,
	                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
	                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
	  }

	  case MVT::v16i16: {
	    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	    SmallVector<int, 8> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
	      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
	      BlendMask = 0;
	      for (int i = 0; i < 8; ++i)
	        if (RepeatedMask[i] >= 8)
	          BlendMask |= 1ull << i;
	      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	                         DAG.getConstant(BlendMask, DL, MVT::i8));
	    }
	    LLVM_FALLTHROUGH;
	  }
	  case MVT::v16i8:
	  case MVT::v32i8: {
	    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
	           "256-bit byte-blends require AVX2 support!");

	    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
	      MVT IntegerType =
	          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	    }

	    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
	    if (SDValue Masked =
	            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
	      return Masked;

	    // Scale the blend by the number of bytes per element.
	    int Scale = VT.getScalarSizeInBits() / 8;

	    // This form of blend is always done on bytes. Compute the byte vector
	    // type.
	    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

	    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
	    // mix of LLVM's code generator and the x86 backend. We tell the code
	    // generator that boolean values in the elements of an x86 vector register
	    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
	    // mapping a select to operand #1, and 'false' mapping to operand #2. The
	    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
	    // of the element (the remaining are ignored) and 0 in that high bit would
	    // mean operand #1 while 1 in the high bit would mean operand #2. So while
	    // the LLVM model for boolean values in vector elements gets the relevant
	    // bit set, it is set backwards and over constrained relative to x86's
	    // actual model.
	    SmallVector<SDValue, 32> VSELECTMask;
	    for (int i = 0, Size = Mask.size(); i < Size; ++i)
	      for (int j = 0; j < Scale; ++j)
	        VSELECTMask.push_back(
	            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
	                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
	                                          MVT::i8));

	    V1 = DAG.getBitcast(BlendVT, V1);
	    V2 = DAG.getBitcast(BlendVT, V2);
	    return DAG.getBitcast(
	        VT,
	        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
	                      V1, V2));
	  }
	  case MVT::v16f32:
	  case MVT::v8f64:
	  case MVT::v8i64:
	  case MVT::v16i32:
	  case MVT::v32i16:
	  case MVT::v64i8: {
	    // 512-bit types blend through a mask register; the mask constant is at
	    // least 8 bits wide.
	    MVT IntegerType =
	        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	  }
	  default:
	    llvm_unreachable("Not a supported integer vector type!");
	  }
	}

	/// Try to lower as a blend of elements from two inputs followed by a
	/// single-input permutation.
	///
	/// The blend keeps every needed element in its original position, so the
	/// approach only works when no position is needed from both inputs at once.
	/// The permutation then moves the blended elements to their final places.
	/// Returns an empty SDValue if such a conflict exists.
	static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
	                                                   SDValue V1, SDValue V2,
	                                                   ArrayRef<int> Mask,
	                                                   SelectionDAG &DAG) {
	  // Both masks are filled in while checking that blending first is viable.
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx < NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;

	    assert(M < NumElts * 2 && "Shuffle input is out of bounds.");

	    // Position of the wanted element inside its input, and therefore inside
	    // the blended vector.
	    const int SrcPos = M % NumElts;
	    int &BlendSlot = BlendMask[SrcPos];
	    if (BlendSlot < 0)
	      BlendSlot = M;
	    else if (BlendSlot != M)
	      return SDValue(); // Can't blend in the needed input!

	    PermuteMask[Idx] = SrcPos;
	  }

	  SDValue Blended = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
	  return DAG.getVectorShuffle(VT, DL, Blended, DAG.getUNDEF(VT), PermuteMask);
	}

	/// \brief Decompose a two-input shuffle into per-input permutes and a blend.
	///
	/// The default strategy permutes V1 and V2 separately so that every element
	/// already sits in its final lane, then blends the two results. When both
	/// inputs would need a real (non no-op) permute, the cheaper
	/// blend-then-permute strategy is attempted first.
	static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
	                                                          MVT VT, SDValue V1,
	                                                          SDValue V2,
	                                                          ArrayRef<int> Mask,
	                                                          SelectionDAG &DAG) {
	  const int NumElts = Mask.size();

	  // Per-input permutation masks plus the final in-place blend mask.
	  SmallVector<int, 32> V1Mask(Mask.size(), -1);
	  SmallVector<int, 32> V2Mask(Mask.size(), -1);
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue; // Undef element: leave all three masks undef here.

	    if (M < NumElts) {
	      V1Mask[Idx] = M;
	      BlendMask[Idx] = Idx;
	    } else {
	      V2Mask[Idx] = M - NumElts;
	      BlendMask[Idx] = Idx + NumElts;
	    }
	  }

	  // Shuffling the inputs is preferred because the shuffle may fold with a
	  // load or bring another benefit. If that costs two real shuffles, though,
	  // blending first and permuting once is the better strategy.
	  const bool BothInputsNeedShuffle =
	      !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask);
	  if (BothInputsNeedShuffle) {
	    SDValue BlendPerm =
	        lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG);
	    if (BlendPerm)
	      return BlendPerm;
	  }

	  SDValue PermutedV1 =
	      DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	  SDValue PermutedV2 =
	      DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
	  return DAG.getVectorShuffle(VT, DL, PermutedV1, PermutedV2, BlendMask);
	}

	/// \brief Try to match a vector shuffle mask as an element rotation.
	///
	/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
	///
	/// On success returns the rotation amount in elements and rewrites \p V1 and
	/// \p V2 to the low and high inputs of the rotation (both are set to the same
	/// value when only one input is referenced). Returns -1 if the mask is an
	/// identity or is not a consistent rotation.
	static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
	                                      ArrayRef<int> Mask) {
	  const int NumElts = Mask.size();

	  // A rotation can be spelled in several ways:
	  // [11, 12, 13, 14, 15, 0, 1, 2]
	  // [-1, 12, 13, 14, -1, -1, 1, -1]
	  // [-1, -1, -1, -1, -1, -1, 1, 2]
	  // [ 3, 4, 5, 6, 7, 8, 9, 10]
	  // [-1, 4, 5, 6, -1, -1, 9, -1]
	  // [-1, 4, 5, 6, -1, -1, -1, -1]
	  int Rotation = 0;
	  SDValue Lo, Hi;
	  for (int Pos = 0; Pos != NumElts; ++Pos) {
	    const int M = Mask[Pos];
	    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
	           "Unexpected mask index.");
	    if (M < 0)
	      continue;

	    // Where would a rotated vector containing this element have started?
	    const int StartIdx = Pos - (M % NumElts);
	    if (StartIdx == 0)
	      return -1; // The identity rotation isn't interesting, stop.

	    // A negative start means we found the tail of a vector, so the rotation
	    // is the missing front; otherwise we found the head and the rotation is
	    // how much of the head is present.
	    const bool FoundTail = StartIdx < 0;
	    const int CandidateRotation = FoundTail ? -StartIdx : NumElts - StartIdx;

	    // Every defined element has to agree on a single rotation amount.
	    if (Rotation != 0 && Rotation != CandidateRotation)
	      return -1;
	    Rotation = CandidateRotation;

	    // The input this mask element reads from.
	    const SDValue MaskV = M < NumElts ? V1 : V2;

	    // Tail elements belong to the high rotation input, head elements to the
	    // low one.
	    SDValue &TargetV = FoundTail ? Hi : Lo;

	    // Record the input on first sight; afterwards it has to stay the same,
	    // otherwise the rotation interleaves the inputs in an unsupported way.
	    if (TargetV && TargetV != MaskV)
	      return -1;
	    TargetV = MaskV;
	  }

	  // Check that we successfully analyzed the mask, and normalize the results.
	  assert(Rotation != 0 && "Failed to locate a viable rotation!");
	  assert((Lo || Hi) && "Failed to find a rotated input vector!");

	  // If only one side was seen, use it for both operands.
	  V1 = Lo ? Lo : Hi;
	  V2 = Hi ? Hi : Lo;

	  return Rotation;
	}

	/// \brief Try to match a vector shuffle mask as a byte rotation.
	///
	/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
	/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
	/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine only
	/// decides whether the mask is valid to lower in that form; it does not
	/// check whether doing so is profitable. It matches shuffle vectors that
	/// look like:
	///
	/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// Returns the rotation in bytes (and updates \p V1 / \p V2 to the rotation
	/// inputs), or -1 if the mask does not match.
	static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
	                                          ArrayRef<int> Mask) {
	  // Don't accept any shuffles with zero elements.
	  for (int M : Mask)
	    if (M == SM_SentinelZero)
	      return -1;

	  // PALIGNR works on 128-bit lanes, so the mask has to repeat per lane.
	  SmallVector<int, 16> RepeatedMask;
	  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
	    return -1;

	  const int EltRotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
	  if (EltRotation <= 0)
	    return -1;

	  // PALIGNR rotates bytes, so convert the element rotation to bytes using
	  // the number of bytes each element occupies in a 16-byte lane.
	  const int EltsPerLane = RepeatedMask.size();
	  const int BytesPerElt = 16 / EltsPerLane;
	  return EltRotation * BytesPerElt;
	}

	/// \brief Lower a vector shuffle as a byte rotation when the mask allows it.
	///
	/// With SSSE3 a single X86ISD::PALIGNR node is emitted on the byte-vector
	/// type. Without SSSE3 the rotation is built from X86ISD::VSHLDQ,
	/// X86ISD::VSRLDQ and ISD::OR on v16i8. Returns an empty SDValue if the mask
	/// is not a byte rotation.
	static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
	                                              SDValue V1, SDValue V2,
	                                              ArrayRef<int> Mask,
	                                              const X86Subtarget &Subtarget,
	                                              SelectionDAG &DAG) {
	  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

	  // The matcher may swap or duplicate the inputs, so work on copies.
	  SDValue Lo = V1, Hi = V2;
	  const int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
	  if (ByteRotation <= 0)
	    return SDValue();

	  // Cast the inputs to i8 vector of correct length to match PALIGNR or
	  // PSLLDQ/PSRLDQ.
	  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	  Lo = DAG.getBitcast(ByteVT, Lo);
	  Hi = DAG.getBitcast(ByteVT, Hi);

	  // SSSE3 targets can use the palignr instruction.
	  if (Subtarget.hasSSSE3()) {
	    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
	           "512-bit PALIGNR requires BWI instructions");
	    SDValue RotateImm = DAG.getConstant(ByteRotation, DL, MVT::i8);
	    SDValue Rotated =
	        DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, RotateImm);
	    return DAG.getBitcast(VT, Rotated);
	  }

	  assert(VT.is128BitVector() &&
	         "Rotate-based lowering only supports 128-bit lowering!");
	  assert(Mask.size() <= 16 &&
	         "Can shuffle at most 16 bytes in a 128-bit vector!");
	  assert(ByteVT == MVT::v16i8 &&
	         "SSE2 rotate lowering only needed for v16i8!");

	  // Default SSE2 implementation: shift the two halves towards each other
	  // and OR them together.
	  const int HiByteShift = ByteRotation;
	  const int LoByteShift = 16 - ByteRotation;

	  SDValue LoImm = DAG.getConstant(LoByteShift, DL, MVT::i8);
	  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, LoImm);
	  SDValue HiImm = DAG.getConstant(HiByteShift, DL, MVT::i8);
	  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, HiImm);
	  SDValue Combined = DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift);
	  return DAG.getBitcast(VT, Combined);
	}

	/// \brief Try to lower a vector shuffle as a dword/qword rotation.
	///
	/// AVX512 has VALIGND/VALIGNQ instructions that perform an arbitrary
	/// rotation of the concatenation of two vectors; this routine tries to
	/// lower a vector shuffle through such a pattern.
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// Returns an X86ISD::VALIGN node, or an empty SDValue if the mask is not a
	/// rotation.
	static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
	                                          SDValue V1, SDValue V2,
	                                          ArrayRef<int> Mask,
	                                          const X86Subtarget &Subtarget,
	                                          SelectionDAG &DAG) {
	  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
	         "Only 32-bit and 64-bit elements are supported!");

	  // 128/256-bit vectors are only supported with VLX.
	  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
	         && "VLX required for 128/256-bit vectors");

	  // The matcher rewrites its operands, so hand it local copies.
	  SDValue LoInput = V1;
	  SDValue HiInput = V2;
	  const int Rotation = matchVectorShuffleAsRotate(LoInput, HiInput, Mask);
	  if (Rotation <= 0)
	    return SDValue();

	  SDValue RotateImm = DAG.getConstant(Rotation, DL, MVT::i8);
	  return DAG.getNode(X86ISD::VALIGN, DL, VT, LoInput, HiInput, RotateImm);
	}

	/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
	///
	/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
	/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
	/// matches elements from one of the input vectors shuffled to the left or
	/// right with zeroable elements 'shifted in'. It handles both the strictly
	/// bit-wise element shifts and the byte shift across an entire 128-bit double
	/// quad word lane.
	///
	/// PSLL : (little-endian) left bit shift.
	/// [ zz, 0, zz, 2 ]
	/// [ -1, 4, zz, -1 ]
	/// PSRL : (little-endian) right bit shift.
	/// [ 1, zz, 3, zz]
	/// [ -1, -1, 7, zz]
	/// PSLLDQ : (little-endian) left byte shift
	/// [ zz, 0, 1, 2, 3, 4, 5, 6]
	/// [ zz, zz, -1, -1, 2, 3, 4, -1]
	/// [ zz, zz, zz, zz, zz, zz, -1, 1]
	/// PSRLDQ : (little-endian) right byte shift
	/// [ 5, 6, 7, zz, zz, zz, zz, zz]
	/// [ -1, 5, 6, 7, zz, zz, zz, zz]
	/// [ 1, 2, -1, -1, -1, -1, zz, zz]
	///
	/// \p MaskOffset is added to the source index each mask element is expected
	/// to hold, which lets the same matcher be run against either shuffle input.
	///
	/// On a match, \p ShiftVT receives the vector type the shift must be
	/// performed in, \p Opcode receives the X86ISD shift opcode, and the shift
	/// amount is returned (in bits for element shifts, in bytes for the byte-shift
	/// opcodes). Returns -1 when no shift matches; the out-parameters are only
	/// written once a candidate has passed the mask check.
	static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
	unsigned ScalarSizeInBits,
	ArrayRef<int> Mask, int MaskOffset,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget) {
	int Size = Mask.size();
	unsigned SizeInBits = Size * ScalarSizeInBits;

	// For every group of Scale elements, verify that the Shift elements that
	// the shift would fill in (the low ones for a left shift, the high ones for
	// a right shift) are all zeroable.
	auto CheckZeros = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i < Size; i += Scale)
	for (int j = 0; j < Shift; ++j)
	if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
	return false;

	return true;
	};

	// Verify that the surviving (Scale - Shift) elements of every group are
	// sequential (or undef) starting at the expected source index, and if so
	// fill in Opcode/ShiftVT and return the shift amount. Returns -1 otherwise.
	auto MatchShift = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i != Size; i += Scale) {
	// Pos is where the surviving elements land in the mask, Low is the source
	// index they are expected to come from.
	unsigned Pos = Left ? i + Shift : i;
	unsigned Low = Left ? i : i + Shift;
	unsigned Len = Scale - Shift;
	if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
	return -1;
	}

	// Groups wider than 64 bits cannot use the element shifts, so they select
	// the byte-shift opcodes and express the amount in bytes instead of bits.
	int ShiftEltBits = ScalarSizeInBits * Scale;
	bool ByteShift = ShiftEltBits > 64;
	Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
	: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
	int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

	// Normalize the scale for byte shifts to still produce an i64 element
	// type.
	Scale = ByteShift ? Scale / 2 : Scale;

	// We need to round trip through the appropriate type for the shift.
	MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
	ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
	: MVT::getVectorVT(ShiftSVT, Size / Scale);
	return (int)ShiftAmt;
	};

	// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
	// keep doubling the size of the integer elements up to that. We can
	// then shift the elements of the integer vector by whole multiples of
	// their width within the elements of the larger integer vector. Test each
	// multiple to see if we can find a match with the moved element indices
	// and that the shifted in elements are all zeroable.
	// 512-bit vectors without BWI are capped at 64-bit groups, which keeps the
	// search from ever selecting the byte-shift opcodes for them.
	unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
	for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
	for (int Shift = 1; Shift != Scale; ++Shift)
	for (bool Left : {true, false})
	if (CheckZeros(Shift, Scale, Left)) {
	int ShiftAmt = MatchShift(Shift, Scale, Left);
	if (0 < ShiftAmt)
	return ShiftAmt;
	}

	// no match
	return -1;
	}

	/// \brief Lower a shuffle that is equivalent to a logical shift of one input.
	///
	/// The mask is matched against \p V1 first and, if that fails, against \p V2
	/// (using a mask offset equal to the mask size). On a match the chosen input
	/// is bitcast to the shift type, shifted by the matched amount and bitcast
	/// back to \p VT. Returns an empty SDValue when neither input matches.
	static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
	                                         SDValue V2, ArrayRef<int> Mask,
	                                         const APInt &Zeroable,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  const int NumElts = Mask.size();
	  assert(NumElts == (int)VT.getVectorNumElements() && "Unexpected mask size");

	  const unsigned EltBits = VT.getScalarSizeInBits();
	  MVT ShiftVT;
	  unsigned Opcode;

	  // Try to match shuffle against V1 shift.
	  SDValue Src = V1;
	  int ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
	                                           /*MaskOffset=*/0, Zeroable,
	                                           Subtarget);

	  // If V1 failed, try to match shuffle against V2 shift.
	  if (ShiftAmt < 0) {
	    Src = V2;
	    ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
	                                         /*MaskOffset=*/NumElts, Zeroable,
	                                         Subtarget);
	    if (ShiftAmt < 0)
	      return SDValue();
	  }

	  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
	         "Illegal integer vector type");

	  // Round trip through the shift type: cast, shift, cast back.
	  SDValue Casted = DAG.getBitcast(ShiftVT, Src);
	  SDValue Amount = DAG.getConstant(ShiftAmt, DL, MVT::i8);
	  SDValue Shifted = DAG.getNode(Opcode, DL, ShiftVT, Casted, Amount);
	  return DAG.getBitcast(VT, Shifted);
	}

	// EXTRQ: Extract Len elements from the lower half of a source, starting at
	// Idx. The remainder of the lower half of the result is zero and the upper
	// half is all undef.
	//
	// On a match, returns true, stores the source in V1, and writes the
	// extraction length and start index (both in bits, masked to 6 bits) to
	// BitLen and BitIdx.
	static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
	                                      ArrayRef<int> Mask, uint64_t &BitLen,
	                                      uint64_t &BitIdx, const APInt &Zeroable) {
	  const int Size = Mask.size();
	  const int HalfSize = Size / 2;
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
	  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

	  // Upper half must be undefined.
	  if (!isUndefInRange(Mask, HalfSize, HalfSize))
	    return false;

	  // The extraction length is the lower half with its trailing zeroable
	  // elements trimmed off.
	  int Len = HalfSize;
	  while (Len > 0 && Zeroable[Len - 1])
	    --Len;
	  assert(Len > 0 && "Zeroable shuffle mask");

	  // Attempt to match first Len sequential elements from the lower half.
	  SDValue Src;
	  int Idx = -1;
	  for (int i = 0; i != Len; ++i) {
	    const int RawM = Mask[i];
	    if (RawM == SM_SentinelUndef)
	      continue;

	    const SDValue &From = (RawM < Size ? V1 : V2);
	    const int M = RawM % Size;

	    // The extracted elements must start at a valid index and all mask
	    // elements must be in the lower half.
	    if (i > M)
	      return false;
	    if (M >= HalfSize)
	      return false;

	    // The first defined element fixes the source and start index; every
	    // later one has to agree with both.
	    const int Start = M - i;
	    const bool FirstDefined = Idx < 0;
	    if (!FirstDefined && !(Src == From && Idx == Start))
	      return false;

	    Src = From;
	    Idx = Start;
	  }

	  if (!Src)
	    return false;
	  if (Idx < 0)
	    return false;

	  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
	  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	  V1 = Src;
	  return true;
	}

	// INSERTQ: Extract lowest Len elements from lower half of second source and
	// insert over first source, starting at Idx.
	// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
	//
	// Searches every insertion point Idx and end position Hi in the lower half
	// and accepts the first combination that explains the mask. On success
	// returns true with V1 set to the base source (A), V2 set to the inserted
	// source (B), and BitLen/BitIdx set to the insertion length and start index
	// in bits, masked to 6 bits. V1 is left as an empty SDValue when every base
	// element is undef, so callers must be prepared to substitute an undef
	// vector. Returns false, leaving all outputs untouched, if nothing matches.
	static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
	ArrayRef<int> Mask, uint64_t &BitLen,
	uint64_t &BitIdx) {
	int Size = Mask.size();
	int HalfSize = Size / 2;
	assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

	// Upper half must be undefined.
	if (!isUndefInRange(Mask, HalfSize, HalfSize))
	return false;

	// Try each candidate insertion point in the lower half.
	for (int Idx = 0; Idx != HalfSize; ++Idx) {
	// Base stays empty until a defined base element pins it to V1 or V2.
	SDValue Base;

	// Attempt to match first source from mask before insertion point.
	if (isUndefInRange(Mask, 0, Idx)) {
	/* EMPTY */
	} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
	Base = V1;
	} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
	Base = V2;
	} else {
	continue;
	}

	// Extend the extraction length looking to match both the insertion of
	// the second source and the remaining elements of the first.
	for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
	SDValue Insert;
	int Len = Hi - Idx;

	// Match insertion.
	// The inserted run must be the lowest Len elements of V1 or of V2.
	if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
	Insert = V1;
	} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
	Insert = V2;
	} else {
	continue;
	}

	// Match the remaining elements of the lower half.
	// They must continue in place from the same base chosen before the
	// insertion point (or choose the base now if it is still unset).
	// NOTE(review): Base is declared per Idx, not per Hi, so a Base assigned
	// here stays set for later Hi iterations that fail the insertion match.
	if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
	/* EMPTY */
	} else if ((!Base \|\| (Base == V1)) &&
	isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
	Base = V1;
	} else if ((!Base \|\| (Base == V2)) &&
	isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
	Size + Hi)) {
	Base = V2;
	} else {
	continue;
	}

	// Convert the element length and index to bit quantities.
	BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	V1 = Base;
	V2 = Insert;
	return true;
	}
	}

	return false;
	}

	/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
	///
	/// The EXTRQ pattern is tried first, then INSERTQ. For INSERTQ either matched
	/// operand may come back empty, in which case an undef vector is used in its
	/// place. Returns an empty SDValue if neither pattern matches.
	static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           SelectionDAG &DAG) {
	  uint64_t BitLen, BitIdx;

	  // Both instructions take their length and index as i8 immediates.
	  auto Imm8 = [&](uint64_t Val) { return DAG.getConstant(Val, DL, MVT::i8); };

	  if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) {
	    SDValue LenImm = Imm8(BitLen);
	    SDValue IdxImm = Imm8(BitIdx);
	    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, LenImm, IdxImm);
	  }

	  if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) {
	    SDValue Base = V1 ? V1 : DAG.getUNDEF(VT);
	    SDValue Insert = V2 ? V2 : DAG.getUNDEF(VT);
	    SDValue LenImm = Imm8(BitLen);
	    SDValue IdxImm = Imm8(BitIdx);
	    return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, LenImm, IdxImm);
	  }

	  return SDValue();
	}

	/// \brief Lower a vector shuffle as a zero or any extension.
	///
	/// Given a specific number of elements, element bit width, and extension
	/// stride, produce either a zero or any extension based on the available
	/// features of the subtarget. The extended elements are consecutive and can
	/// start from an offset element index in the input; to avoid excess
	/// shuffling the offset must either be in the bottom lane or at the start
	/// of a higher lane. All extended elements must be from the same lane.
	///
	/// \p Scale is the ratio between the extended and the original element
	/// width, \p Offset is the index of the first source element to extend, and
	/// \p AnyExt tells whether the bits above each source element may be left
	/// undefined rather than zeroed. \p Mask is only consulted on the SSE4A path.
	/// Returns the lowered value bitcast back to \p VT, or an empty SDValue for
	/// the one case that is deliberately left to a later PUNPCK match.
	static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
	ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(Scale > 1 && "Need a scale to extend.");
	int EltBits = VT.getScalarSizeInBits();
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = 128 / EltBits;
	int OffsetLane = Offset / NumEltsPerLane;
	assert((EltBits == 8 \|\| EltBits == 16 \|\| EltBits == 32) &&
	"Only 8, 16, and 32 bit elements can be extended.");
	assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
	assert(0 <= Offset && "Extension offset must be positive.");
	assert((Offset < NumEltsPerLane \|\| Offset % NumEltsPerLane == 0) &&
	"Extension offset must be in the first lane or start an upper lane.");

	// Check that an index is in same lane as the base offset.
	auto SafeOffset = [&](int Idx) {
	return OffsetLane == (Idx / NumEltsPerLane);
	};

	// Shift along an input so that the offset base moves to the first element.
	// Only the first NumElements / Scale positions are populated; source indices
	// that would leave the offset's lane are left undef.
	auto ShuffleOffset = [&](SDValue V) {
	if (!Offset)
	return V;

	SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	for (int i = 0; i * Scale < NumElements; ++i) {
	int SrcIdx = i + Offset;
	ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
	}
	return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
	};

	// Found a valid zext mask! Try various lowering strategies based on the
	// input type and available ISA extensions.
	// With SSE4.1 a VZEXT node is emitted regardless of AnyExt.
	if (Subtarget.hasSSE41()) {
	// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
	// PUNPCK will catch this in a later shuffle match.
	if (Offset && Scale == 2 && VT.is128BitVector())
	return SDValue();
	MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
	NumElements / Scale);
	InputV = ShuffleOffset(InputV);
	InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
	return DAG.getBitcast(VT, InputV);
	}

	// Every strategy below is limited to 128-bit vectors.
	assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

	// For any extends we can cheat for larger element sizes and use shuffle
	// instructions that can fold with a load and/or copy.
	if (AnyExt && EltBits == 32) {
	// Place the two source dwords in the even result slots; odd slots are
	// don't-care because this is an any-extend.
	int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
	-1};
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	DAG.getBitcast(MVT::v4i32, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}
	if (AnyExt && EltBits == 16 && Scale > 2) {
	// First move the dwords holding the source words into the even dword
	// slots, then apply a word shuffle whose kind depends on whether Offset is
	// odd or even.
	int PSHUFDMask[4] = {Offset / 2, -1,
	SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
	InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	DAG.getBitcast(MVT::v4i32, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	int PSHUFWMask[4] = {1, -1, -1, -1};
	unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
	return DAG.getBitcast(
	VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
	DAG.getBitcast(MVT::v8i16, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
	}

	// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
	// to 64-bits.
	if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
	assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
	assert(VT.is128BitVector() && "Unexpected vector width!");

	// Extract the first source element (EltBits wide, at bit LoIdx) into the
	// low 64-bit element.
	int LoIdx = Offset * EltBits;
	SDValue Lo = DAG.getBitcast(
	MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	DAG.getConstant(EltBits, DL, MVT::i8),
	DAG.getConstant(LoIdx, DL, MVT::i8)));

	// A single extract suffices when the upper half of the result is undef or
	// the next source element would lie outside the offset's lane.
	if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) \|\|
	!SafeOffset(Offset + 1))
	return DAG.getBitcast(VT, Lo);

	// Otherwise extract the second source element as well and combine the two
	// 64-bit results with an unpack.
	int HiIdx = (Offset + 1) * EltBits;
	SDValue Hi = DAG.getBitcast(
	MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	DAG.getConstant(EltBits, DL, MVT::i8),
	DAG.getConstant(HiIdx, DL, MVT::i8)));
	return DAG.getBitcast(VT,
	DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
	}

	// If this would require more than 2 unpack instructions to expand, use
	// pshufb when available. We can only use more than 2 unpack instructions
	// when zero extending i8 elements which also makes it easier to use pshufb.
	if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
	assert(NumElements == 16 && "Unexpected byte vector width!");
	// Every Scale-th result byte takes a source byte; all other bytes (and
	// any source index outside the offset's lane) get the control value 0x80.
	// NOTE(review): 0x80 is presumably PSHUFB's "write zero" encoding - confirm
	// against the instruction reference.
	SDValue PSHUFBMask[16];
	for (int i = 0; i < 16; ++i) {
	int Idx = Offset + (i / Scale);
	PSHUFBMask[i] = DAG.getConstant(
	(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
	}
	InputV = DAG.getBitcast(MVT::v16i8, InputV);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
	DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
	}

	// If we are extending from an offset, ensure we start on a boundary that
	// we can unpack from.
	int AlignToUnpack = Offset % (NumElements / Scale);
	if (AlignToUnpack) {
	// Slide the input down by AlignToUnpack elements and reduce Offset by the
	// same amount so the two stay in sync.
	SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	for (int i = AlignToUnpack; i < NumElements; ++i)
	ShMask[i - AlignToUnpack] = i;
	InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
	Offset -= AlignToUnpack;
	}

	// Otherwise emit a sequence of unpacks.
	// Each round interleaves the input with zero (or undef for an any-extend),
	// doubling the element width and halving Scale and the element count, until
	// Scale reaches 1.
	do {
	// Use the high-half unpack when the remaining offset points into the
	// upper half of the current vector.
	unsigned UnpackLoHi = X86ISD::UNPCKL;
	if (Offset >= (NumElements / 2)) {
	UnpackLoHi = X86ISD::UNPCKH;
	Offset -= (NumElements / 2);
	}

	MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
	SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
	: getZeroVector(InputVT, Subtarget, DAG, DL);
	InputV = DAG.getBitcast(InputVT, InputV);
	InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
	Scale /= 2;
	EltBits *= 2;
	NumElements /= 2;
	} while (Scale > 1);
	return DAG.getBitcast(VT, InputV);
	}

	/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
	///
	/// This routine will try to do everything in its power to cleverly lower
	/// a shuffle which happens to match the pattern of a zero extend. It doesn't
	/// check for the profitability of this lowering, it tries to aggressively
	/// match this pattern. It will use all of the micro-architectural details it
	/// can to emit an efficient lowering. It handles both blends with all-zero
	/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
	/// masking out later).
	///
	/// The reason we have dedicated lowering for zext-style shuffles is that they
	/// are both incredibly common and often quite performance sensitive.
	///
	/// Extension scales are tried from the widest (64-bit result elements) down
	/// to a scale of 2. If none matches and the vector is 128 bits wide, a final
	/// attempt is made to express the shuffle as a VZEXT_MOVL of the low 64 bits.
	/// Returns an empty SDValue when no extension lowering applies.
	static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	int Bits = VT.getSizeInBits();
	int NumLanes = Bits / 128;
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = NumElements / NumLanes;
	assert(VT.getScalarSizeInBits() <= 32 &&
	"Exceeds 32-bit integer zero extension limit");
	assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

	// Define a helper function to check a particular ext-scale and lower to it if
	// valid.
	// For a given Scale, every Scale-th mask element is a "base" element that
	// must read consecutive source elements of one input; the elements in
	// between must be zeroable (or undef, which keeps the any-extend option).
	auto Lower = [&](int Scale) -> SDValue {
	SDValue InputV;
	bool AnyExt = true;
	int Offset = 0;
	int Matches = 0;
	for (int i = 0; i < NumElements; ++i) {
	int M = Mask[i];
	if (M < 0)
	continue; // Valid anywhere but doesn't tell us anything.
	if (i % Scale != 0) {
	// Each of the extended elements need to be zeroable.
	if (!Zeroable[i])
	return SDValue();

	// We no longer are in the anyext case.
	AnyExt = false;
	continue;
	}

	// Each of the base elements needs to be consecutive indices into the
	// same input vector.
	SDValue V = M < NumElements ? V1 : V2;
	M = M % NumElements;
	if (!InputV) {
	// The first base element fixes both the input and the source offset.
	InputV = V;
	Offset = M - (i / Scale);
	} else if (InputV != V)
	return SDValue(); // Flip-flopping inputs.

	// Offset must start in the lowest 128-bit lane or at the start of an
	// upper lane.
	// FIXME: Is it ever worth allowing a negative base offset?
	if (!((0 <= Offset && Offset < NumEltsPerLane) \|\|
	(Offset % NumEltsPerLane) == 0))
	return SDValue();

	// If we are offsetting, all referenced entries must come from the same
	// lane.
	if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
	return SDValue();

	// M was already reduced modulo NumElements above, so the modulo here does
	// not change its value.
	if ((M % NumElements) != (Offset + (i / Scale)))
	return SDValue(); // Non-consecutive strided elements.
	Matches++;
	}

	// If we fail to find an input, we have a zero-shuffle which should always
	// have already been handled.
	// FIXME: Maybe handle this here in case during blending we end up with one?
	if (!InputV)
	return SDValue();

	// If we are offsetting, don't extend if we only match a single input, we
	// can always do better by using a basic PSHUF or PUNPCK.
	if (Offset != 0 && Matches < 2)
	return SDValue();

	return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
	DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
	};

	// The widest scale possible for extending is to a 64-bit integer.
	assert(Bits % 64 == 0 &&
	"The number of bits in a vector must be divisible by 64 on x86!");
	int NumExtElements = Bits / 64;

	// Each iteration, try extending the elements half as much, but into twice as
	// many elements.
	// The loop stops before NumExtElements reaches NumElements, i.e. before
	// the scale would drop to 1.
	for (; NumExtElements < NumElements; NumExtElements *= 2) {
	assert(NumElements % NumExtElements == 0 &&
	"The input vector size must be divisible by the extended size.");
	if (SDValue V = Lower(NumElements / NumExtElements))
	return V;
	}

	// General extends failed, but 128-bit vectors may be able to use MOVQ.
	if (Bits != 128)
	return SDValue();

	// Returns one of the source operands if the shuffle can be reduced to a
	// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
	// That requires the whole upper half to be zeroable and the lower half to
	// be an in-order (or undef) copy of the lower half of V1 or V2.
	auto CanZExtLowHalf = [&]() {
	for (int i = NumElements / 2; i != NumElements; ++i)
	if (!Zeroable[i])
	return SDValue();
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
	return V1;
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
	return V2;
	return SDValue();
	};

	if (SDValue V = CanZExtLowHalf()) {
	// Perform the move in v2i64 and cast the result back to the shuffle type.
	V = DAG.getBitcast(MVT::v2i64, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
	return DAG.getBitcast(VT, V);
	}

	// No viable ext lowering found.
	return SDValue();
	}

	/// \brief Try to get a scalar value for a specific element of a vector.
	///
	/// Looks through bitcasts to a BUILD_VECTOR (any index) or a
	/// SCALAR_TO_VECTOR (index 0 only) and returns the corresponding scalar
	/// operand bitcast to the element type of \p V. Returns an empty SDValue if
	/// the bitcasts change the element width, if no such node is found, or if the
	/// scalar operand's size differs from the element size.
	static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
	                                              SelectionDAG &DAG) {
	  MVT OrigVT = V.getSimpleValueType();
	  MVT EltVT = OrigVT.getVectorElementType();
	  SDValue Src = peekThroughBitcasts(V);

	  // If the bitcasts shift the element size, we can't extract an equivalent
	  // element from it.
	  MVT SrcVT = Src.getSimpleValueType();
	  if (!SrcVT.isVector())
	    return SDValue();
	  if (SrcVT.getScalarSizeInBits() != OrigVT.getScalarSizeInBits())
	    return SDValue();

	  // Only BUILD_VECTOR, or element zero of SCALAR_TO_VECTOR, exposes the
	  // scalar directly as an operand.
	  const unsigned Opc = Src.getOpcode();
	  const bool FromBuildVector = Opc == ISD::BUILD_VECTOR;
	  const bool FromScalarToVector = Idx == 0 && Opc == ISD::SCALAR_TO_VECTOR;
	  if (!FromBuildVector && !FromScalarToVector)
	    return SDValue();

	  // Ensure the scalar operand is the same size as the destination.
	  // FIXME: Add support for scalar truncation where possible.
	  SDValue Scalar = Src.getOperand(Idx);
	  if (EltVT.getSizeInBits() != Scalar.getSimpleValueType().getSizeInBits())
	    return SDValue();

	  return DAG.getBitcast(EltVT, Scalar);
	}

	/// \brief Helper to test for a load that can be folded with x86 shuffles.
	///
	/// Knowing this matters because the set of usable instructions differs
	/// significantly depending on whether the operand is a load. The check looks
	/// through bitcasts and accepts only non-extending loads.
	static bool isShuffleFoldableLoad(SDValue V) {
	  SDValue Src = peekThroughBitcasts(V);
	  return ISD::isNON_EXTLoad(Src.getNode());
	}

	/// \brief Try to lower insertion of a single element into a zero vector.
	///
	/// This is a common pattern that we have especially efficient patterns to lower
	/// across all subtarget feature sets.
	///
	/// The element to insert is the first mask entry that refers to \p V2. When
	/// all other result elements are zeroable the element is moved into place
	/// with VZEXT_MOVL plus, if it does not land in lane 0, a shuffle or byte
	/// shift. When they are not zeroable, only the 128-bit floating point
	/// MOVSS/MOVSD form (insertion into lane 0 with V1 otherwise unpermuted) is
	/// supported.
	///
	/// Returns an empty SDValue if the shuffle cannot be lowered this way,
	/// including when \p Mask contains no element taken from \p V2.
	static SDValue lowerVectorShuffleAsElementInsertion(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, const X86Subtarget &Subtarget,
	    SelectionDAG &DAG) {
	  MVT ExtVT = VT;
	  MVT EltVT = VT.getVectorElementType();

	  // Position of the first mask element that reads from V2.
	  int V2Index =
	      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
	      Mask.begin();

	  // If nothing reads from V2 there is no element to insert, and V2Index is
	  // one past the end of the mask; indexing Mask with it below would be out
	  // of bounds. Report "not lowered" instead.
	  if (V2Index == (int)Mask.size())
	    return SDValue();

	  // V1 counts as zeroable when every lane other than the insertion lane is.
	  bool IsV1Zeroable = true;
	  for (int i = 0, Size = Mask.size(); i < Size; ++i)
	    if (i != V2Index && !Zeroable[i]) {
	      IsV1Zeroable = false;
	      break;
	    }

	  // Check for a single input from a SCALAR_TO_VECTOR node.
	  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
	  // all the smarts here sunk into that routine. However, the current
	  // lowering of BUILD_VECTOR makes that nearly impossible until the old
	  // vector shuffle lowering is dead.
	  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
	                                               DAG);
	  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
	    // We need to zext the scalar if it is smaller than an i32.
	    V2S = DAG.getBitcast(EltVT, V2S);
	    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
	      // Using zext to expand a narrow element won't work for non-zero
	      // insertions.
	      if (!IsV1Zeroable)
	        return SDValue();

	      // Zero-extend directly to i32.
	      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
	      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
	    }
	    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
	  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
	             EltVT == MVT::i16) {
	    // Either not inserting from the low element of the input or the input
	    // element size is too small to use VZEXT_MOVL to clear the high bits.
	    return SDValue();
	  }

	  if (!IsV1Zeroable) {
	    // If V1 can't be treated as a zero vector we have fewer options to lower
	    // this. We can't support integer vectors or non-zero targets cheaply, and
	    // the V1 elements can't be permuted in any way.
	    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
	    if (!VT.isFloatingPoint() || V2Index != 0)
	      return SDValue();
	    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
	    V1Mask[V2Index] = -1;
	    if (!isNoopShuffleMask(V1Mask))
	      return SDValue();
	    if (!VT.is128BitVector())
	      return SDValue();

	    // Otherwise, use MOVSD or MOVSS.
	    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
	           "Only two types of floating point element types to handle!");
	    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
	                       ExtVT, V1, V2);
	  }

	  // This lowering only works for the low element with floating point vectors.
	  if (VT.isFloatingPoint() && V2Index != 0)
	    return SDValue();

	  // Keep the low element of V2 and zero everything above it.
	  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
	  if (ExtVT != VT)
	    V2 = DAG.getBitcast(VT, V2);

	  if (V2Index != 0) {
	    // If we have 4 or fewer lanes we can cheaply shuffle the element into
	    // the desired position. Otherwise it is more efficient to do a vector
	    // shift left. We know that we can do a vector shift left because all
	    // the inputs are zero.
	    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
	      // Every lane reads element 1 of V2 (zero after VZEXT_MOVL) except the
	      // insertion lane, which reads the inserted element 0.
	      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
	      V2Shuffle[V2Index] = 0;
	      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
	    } else {
	      // Shift the whole vector left by the byte offset of the target lane.
	      V2 = DAG.getBitcast(MVT::v16i8, V2);
	      V2 = DAG.getNode(
	          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
	          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
	                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
	                              DAG.getDataLayout(), VT)));
	      V2 = DAG.getBitcast(VT, V2);
	    }
	  }
	  return V2;
	}

	/// Lower a broadcast whose source element is a slice of a wider integer
	/// scalar, i.e. the element lives inside an operand of a
	/// scalar_to_vector/build_vector node \p V0 whose elements are larger than
	/// those of \p VT.
	///
	/// The wide scalar is shifted right (if the wanted slice is not the least
	/// significant one), truncated to the element type of \p VT and then fed to
	/// X86ISD::VBROADCAST.
	///
	/// Requires AVX2 (asserted). Returns an empty SDValue when \p V0 does not have
	/// wider integer elements or is not a suitable scalar_to_vector/build_vector.
	static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
	                                                  SDValue V0, int BroadcastIdx,
	                                                  const X86Subtarget &Subtarget,
	                                                  SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "We can only lower integer broadcasts with AVX2!");

	  EVT EltVT = VT.getVectorElementType();
	  EVT V0VT = V0.getValueType();

	  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
	  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

	  // Only integer sources can be shifted and truncated.
	  EVT V0EltVT = V0VT.getVectorElementType();
	  if (!V0EltVT.isInteger())
	    return SDValue();

	  const unsigned EltSize = EltVT.getSizeInBits();
	  const unsigned V0EltSize = V0EltVT.getSizeInBits();

	  // Nothing is truncated unless the source elements are strictly wider.
	  if (V0EltSize <= EltSize)
	    return SDValue();

	  assert(((V0EltSize % EltSize) == 0) &&
	         "Scalar type sizes must all be powers of 2 on x86!");

	  // Scale is the number of VT-sized elements packed in one V0 element;
	  // V0BroadcastIdx is the V0 element that contains the broadcast element.
	  const unsigned V0Opc = V0.getOpcode();
	  const unsigned Scale = V0EltSize / EltSize;
	  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

	  // Accept any build_vector, or a scalar_to_vector when reading element 0.
	  const bool IsBuildVector = V0Opc == ISD::BUILD_VECTOR;
	  const bool IsLowScalarToVector =
	      V0Opc == ISD::SCALAR_TO_VECTOR && V0BroadcastIdx == 0;
	  if (!IsBuildVector && !IsLowScalarToVector)
	    return SDValue();

	  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

	  // When the wanted slice is not the least significant one, shift it down
	  // first so that the truncate picks it up. Hopefully the trunc/srl/load can
	  // be folded into the broadcast; even if not (and
	  // !isShuffleFoldableLoad(Scalar)), vpbroadcast+vmovd+shr is preferred over
	  // vpshufb(m)+vmovd.
	  const unsigned SubEltIdx = BroadcastIdx % Scale;
	  if (SubEltIdx != 0) {
	    EVT ScalarVT = Scalar.getValueType();
	    SDValue ShiftAmt = DAG.getConstant(SubEltIdx * EltSize, DL, ScalarVT);
	    Scalar = DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmt);
	  }

	  SDValue Truncated = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar);
	  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Truncated);
	}

	/// \brief Try to lower broadcast of a single element.
	///
	/// For convenience, this code also bundles all of the subtarget feature set
	/// filtering. While a little annoying to re-dispatch on type here, there isn't
	/// a convenient way to factor it out.
	///
	/// \param DL        Location attached to every node that is created.
	/// \param VT        Result type of the shuffle.
	/// \param V1        First shuffle input; the broadcast element comes from it.
	/// \param V2        Second shuffle input (only used for the mask equivalence
	///                  check).
	/// \param Mask      Shuffle mask to be matched against a splat of one index.
	/// \param Subtarget Used for the SSE3/AVX/AVX2 and 64-bit feature checks.
	/// \param DAG       DAG in which new nodes are created.
	/// \returns the lowered broadcast bitcast to \p VT, or an empty SDValue if the
	/// mask is not a broadcast or the broadcast cannot be formed on this subtarget.
	static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
	                                             SDValue V1, SDValue V2,
	                                             ArrayRef<int> Mask,
	                                             const X86Subtarget &Subtarget,
	                                             SelectionDAG &DAG) {
	  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
	        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
	        (Subtarget.hasAVX2() && VT.isInteger())))
	    return SDValue();

	  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
	  // we can only broadcast from a register with AVX2.
	  unsigned NumElts = Mask.size();
	  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
	                        ? X86ISD::MOVDDUP
	                        : X86ISD::VBROADCAST;
	  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

	  // Check that the mask is a broadcast: find the first index i for which the
	  // mask is equivalent to the splat mask <i, i, ..., i>.
	  int BroadcastIdx = -1;
	  for (int i = 0; i != (int)NumElts; ++i) {
	    SmallVector<int, 8> BroadcastMask(NumElts, i);
	    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
	      BroadcastIdx = i;
	      break;
	    }
	  }

	  if (BroadcastIdx < 0)
	    return SDValue();
	  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
	                                            "a sorted mask where the broadcast "
	                                            "comes from V1.");

	  // Go up the chain of (vector) values to find a scalar load that we can
	  // combine with the broadcast. BroadcastIdx is kept relative to the current
	  // value of V throughout the walk.
	  SDValue V = V1;
	  for (;;) {
	    switch (V.getOpcode()) {
	    case ISD::BITCAST: {
	      // Peek through bitcasts as long as BroadcastIdx can be adjusted.
	      SDValue VSrc = V.getOperand(0);
	      unsigned NumEltBits = V.getScalarValueSizeInBits();
	      unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
	      if ((NumEltBits % NumSrcBits) == 0)
	        BroadcastIdx *= (NumEltBits / NumSrcBits);
	      else if ((NumSrcBits % NumEltBits) == 0 &&
	               (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
	        BroadcastIdx /= (NumSrcBits / NumEltBits);
	      else
	        break;
	      V = VSrc;
	      continue;
	    }
	    case ISD::CONCAT_VECTORS: {
	      // Size the operands by their own element count rather than by
	      // Mask.size(): by the time we get here BroadcastIdx may have been
	      // rescaled by a bitcast, or V may already be an operand of an outer
	      // concat/insert_subvector, so the shuffle mask width no longer
	      // describes V and would select the wrong operand (or leave
	      // BroadcastIdx out of range for the selected operand).
	      int OperandSize = V.getOperand(0).getValueType().getVectorNumElements();
	      V = V.getOperand(BroadcastIdx / OperandSize);
	      BroadcastIdx %= OperandSize;
	      continue;
	    }
	    case ISD::INSERT_SUBVECTOR: {
	      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
	      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
	      if (!ConstantIdx)
	        break;

	      // Follow the inserted subvector if it covers BroadcastIdx, otherwise
	      // keep looking at the vector that was inserted into.
	      int BeginIdx = (int)ConstantIdx->getZExtValue();
	      int EndIdx =
	          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
	      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
	        BroadcastIdx -= BeginIdx;
	        V = VInner;
	      } else {
	        V = VOuter;
	      }
	      continue;
	    }
	    }
	    // Reached for any opcode not handled above and for the 'break's inside
	    // the cases: stop walking.
	    break;
	  }

	  // Ensure the source vector and BroadcastIdx are for a suitable type.
	  if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
	    unsigned NumEltBits = VT.getScalarSizeInBits();
	    unsigned NumSrcBits = V.getScalarValueSizeInBits();
	    if ((NumSrcBits % NumEltBits) == 0)
	      BroadcastIdx *= (NumSrcBits / NumEltBits);
	    else if ((NumEltBits % NumSrcBits) == 0 &&
	             (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
	      BroadcastIdx /= (NumEltBits / NumSrcBits);
	    else
	      return SDValue();

	    unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
	    MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
	    V = DAG.getBitcast(SrcVT, V);
	  }

	  // Check if this is a broadcast of a scalar. We special case lowering
	  // for scalars so that we can more effectively fold with loads.
	  // First, look through bitcast: if the original value has a larger element
	  // type than the shuffle, the broadcast element is in essence truncated.
	  // Make that explicit to ease folding.
	  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
	    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
	            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
	      return TruncBroadcast;

	  MVT BroadcastVT = VT;

	  // Peek through any bitcast (only useful for loads).
	  SDValue BC = peekThroughBitcasts(V);

	  // Also check the simpler case, where we can directly reuse the scalar.
	  if (V.getOpcode() == ISD::BUILD_VECTOR ||
	      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
	    V = V.getOperand(BroadcastIdx);

	    // If we can't broadcast from a register, check that the input is a load.
	    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
	      return SDValue();
	  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
	    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
	    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
	      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
	      Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
	                   ? X86ISD::MOVDDUP
	                   : Opcode;
	    }

	    // If we are broadcasting a load that is only used by the shuffle
	    // then we can reduce the vector load to the broadcasted scalar load.
	    LoadSDNode *Ld = cast<LoadSDNode>(BC);
	    SDValue BaseAddr = Ld->getOperand(1);
	    EVT SVT = BroadcastVT.getScalarType();
	    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
	    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
	    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
	                    DAG.getMachineFunction().getMachineMemOperand(
	                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
	    DAG.makeEquivalentMemoryOrdering(Ld, V);
	  } else if (!BroadcastFromReg) {
	    // We can't broadcast from a vector register.
	    return SDValue();
	  } else if (BroadcastIdx != 0) {
	    // We can only broadcast from the zero-element of a vector register,
	    // but it can be advantageous to broadcast from the zero-element of a
	    // subvector.
	    if (!VT.is256BitVector() && !VT.is512BitVector())
	      return SDValue();

	    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
	    if (VT == MVT::v4f64 || VT == MVT::v4i64)
	      return SDValue();

	    // Only broadcast the zero-element of a 128-bit subvector.
	    unsigned EltSize = VT.getScalarSizeInBits();
	    if (((BroadcastIdx * EltSize) % 128) != 0)
	      return SDValue();

	    // The shuffle input might have been a bitcast we looked through; look at
	    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
	    // later bitcast it to BroadcastVT.
	    assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
	           "Unexpected vector element size");
	    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
	           "Unexpected vector size");
	    V = extract128BitVector(V, BroadcastIdx, DAG, DL);
	  }

	  // MOVDDUP takes a vector operand, so wrap a scalar source in a v2f64.
	  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
	    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	                    DAG.getBitcast(MVT::f64, V));

	  // Bitcast back to the same scalar type as BroadcastVT.
	  MVT SrcVT = V.getSimpleValueType();
	  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
	    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
	           "Unexpected vector element size");
	    if (SrcVT.isVector()) {
	      unsigned NumSrcElts = SrcVT.getVectorNumElements();
	      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
	    } else {
	      SrcVT = BroadcastVT.getScalarType();
	    }
	    V = DAG.getBitcast(SrcVT, V);
	  }

	  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
	  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
	    V = DAG.getBitcast(MVT::f64, V);
	    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
	    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
	  }

	  // We only support broadcasting from 128-bit vectors to minimize the
	  // number of patterns we need to deal with in isel. So extract down to
	  // 128-bits, removing as many bitcasts as possible.
	  if (SrcVT.getSizeInBits() > 128) {
	    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
	                                 128 / SrcVT.getScalarSizeInBits());
	    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
	    V = DAG.getBitcast(ExtVT, V);
	  }

	  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
	}

	/// Decide whether a 4-element shuffle can be performed by one INSERTPS.
	///
	/// INSERTPS is only used when the elements of the destination input are
	/// already in their final lanes, because otherwise two SHUFPS instructions
	/// (which are much smaller to encode than a SHUFPS plus an INSERTPS) do the
	/// job. It is also used when a single element of the destination input is out
	/// of place and every other lane is zeroable.
	///
	/// On success \p V1, \p V2 and \p InsertPSMask are overwritten with the
	/// operands and immediate to use and true is returned; on failure they are
	/// left untouched and false is returned.
	static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
	                                         unsigned &InsertPSMask,
	                                         const APInt &Zeroable,
	                                         ArrayRef<int> Mask,
	                                         SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
	  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  // Try to express CandidateMask as "insert one element of Dst or Src into
	  // Dst (or undef)". Updates V1, V2 and InsertPSMask only when it succeeds.
	  auto TryMatch = [&](SDValue Dst, SDValue Src,
	                      ArrayRef<int> CandidateMask) -> bool {
	    unsigned ZeroBits = 0;
	    int DstMovedLane = -1; // Lane fed by an out-of-place Dst element.
	    int SrcInsertLane = -1; // Lane fed by a Src element.
	    bool DstKeptInPlace = false;

	    for (int Lane = 0; Lane != 4; ++Lane) {
	      // Zeroable lanes (which include undefs) go into the zero mask.
	      if (Zeroable[Lane]) {
	        ZeroBits |= 1 << Lane;
	        continue;
	      }

	      // Remember whether any Dst element stays where it is.
	      if (CandidateMask[Lane] == Lane) {
	        DstKeptInPlace = true;
	        continue;
	      }

	      // Only one non-zeroable element may be inserted.
	      if (DstMovedLane >= 0 || SrcInsertLane >= 0)
	        return false;

	      if (CandidateMask[Lane] < 4)
	        DstMovedLane = Lane; // Dst element that has to move.
	      else
	        SrcInsertLane = Lane; // Src element to insert.
	    }

	    // Nothing (non-zeroable) to insert: not worth an INSERTPS.
	    if (DstMovedLane < 0 && SrcInsertLane < 0)
	      return false;

	    // Work out the source/destination lanes of the insertion. The source
	    // lane is relative to the inserted vector, not to the concatenation.
	    unsigned SrcLane = 0;
	    if (DstMovedLane >= 0) {
	      // An out-of-place Dst element is inserted from Dst itself; the
	      // original second operand is not used at all.
	      SrcLane = CandidateMask[DstMovedLane];
	      SrcInsertLane = DstMovedLane;
	      Src = Dst;
	    } else {
	      SrcLane = CandidateMask[SrcInsertLane] - 4;
	    }

	    // If no Dst element is used in place the result only depends on the
	    // zero mask and the inserted element, so drop the dependency on Dst.
	    if (!DstKeptInPlace)
	      Dst = DAG.getUNDEF(MVT::v4f32);

	    // Publish the operands and the immediate.
	    V1 = Dst;
	    V2 = Src;
	    InsertPSMask = (SrcLane << 6) | (SrcInsertLane << 4) | ZeroBits;
	    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	    return true;
	  };

	  if (TryMatch(V1, V2, Mask))
	    return true;

	  // Swap the roles of the two inputs and try once more.
	  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
	  ShuffleVectorSDNode::commuteMask(CommutedMask);
	  return TryMatch(V2, V1, CommutedMask);
	}

	/// Lower a v4f32 shuffle to a single X86ISD::INSERTPS node when
	/// matchVectorShuffleAsInsertPS finds a suitable operand/immediate
	/// combination; otherwise return an empty SDValue.
	static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
	                                            SDValue V2, ArrayRef<int> Mask,
	                                            const APInt &Zeroable,
	                                            SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

	  // The matcher rewrites V1/V2 and fills in the immediate on success.
	  unsigned InsertPSMask;
	  if (matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask,
	                                   DAG)) {
	    SDValue Imm = DAG.getConstant(InsertPSMask, DL, MVT::i8);
	    return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, Imm);
	  }

	  // No INSERTPS pattern applies.
	  return SDValue();
	}

	/// \brief Try to lower a shuffle as a permute of the inputs followed by an
	/// UNPCK instruction.
	///
	/// This specifically targets cases where we end up with alternating between
	/// the two inputs, and so can permute them into something that feeds a single
	/// UNPCK instruction. Note that this routine only targets integer vectors
	/// because for floating point vectors we have a generalized SHUFPS lowering
	/// strategy that handles everything that doesn't exactly match an unpack,
	/// making this clever lowering unnecessary.
	///
	/// Two strategies are attempted in order:
	///  1. Permute each input separately and then unpack them, trying unpack
	///     element sizes from 64 bits down to the element size of \p VT.
	///  2. If all used elements come from a single half of the inputs, unpack
	///     the two inputs first and permute the result.
	///
	/// \param VT   Integer, 128-bit vector type of the shuffle (asserted).
	/// \param V1   First input.
	/// \param V2   Second input; must not be undef (asserted).
	/// \param Mask Shuffle mask; must have at least two elements (asserted).
	/// \returns the lowered value, or an empty SDValue if neither strategy
	/// applies.
	static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
	SDValue V1, SDValue V2,
	ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(!VT.isFloatingPoint() &&
	"This routine only supports integer vectors.");
	assert(VT.is128BitVector() &&
	"This routine only works on 128-bit vectors.");
	assert(!V2.isUndef() &&
	"This routine should only be used when blending two inputs.");
	assert(Mask.size() >= 2 && "Single element masks are invalid.");

	int Size = Mask.size();

	// Count how many mask entries read from the low half and from the high half
	// of their input (M % Size folds V2 indices onto V1's index range). Undef
	// (negative) entries are counted in neither: the first predicate rejects
	// them explicitly and for the second a negative M % Size is never
	// >= Size / 2.
	int NumLoInputs =
	count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
	int NumHiInputs =
	count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

	// Unpack whichever half provides the majority of the inputs (low on ties).
	bool UnpackLo = NumLoInputs >= NumHiInputs;

	// Attempt the "permute both inputs, then unpack" lowering using unpack
	// elements of ScalarSize bits, each holding Scale elements of VT. Returns an
	// empty SDValue if the mask does not fit or the lowering is not considered
	// worthwhile. Note that V1 and V2 are captured by reference and are
	// overwritten on the successful path.
	auto TryUnpack = [&](int ScalarSize, int Scale) {
	SmallVector<int, 16> V1Mask((unsigned)Size, -1);
	SmallVector<int, 16> V2Mask((unsigned)Size, -1);

	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	// Each element of the unpack contains Scale elements from this mask.
	int UnpackIdx = i / Scale;

	// We only handle the case where V1 feeds the first slots of the unpack.
	// We rely on canonicalization to ensure this is the case.
	if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
	return SDValue();

	// Setup the mask for this input. The indexing is tricky as we have to
	// handle the unpack stride.
	SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
	VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
	Mask[i] % Size;
	}

	// If we will have to shuffle both inputs to use the unpack, check whether
	// we can just unpack first and shuffle the result. If so, skip this unpack.
	if ((NumLoInputs == 0 \|\| NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
	!isNoopShuffleMask(V2Mask))
	return SDValue();

	// Shuffle the inputs into place.
	V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

	// Cast the inputs to the type we will use to unpack them.
	MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
	V1 = DAG.getBitcast(UnpackVT, V1);
	V2 = DAG.getBitcast(UnpackVT, V2);

	// Unpack the inputs and cast the result back to the desired type.
	return DAG.getBitcast(
	VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	UnpackVT, V1, V2));
	};

	// We try each unpack from the largest to the smallest to try and find one
	// that fits this mask.
	int OrigScalarSize = VT.getScalarSizeInBits();
	for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
	if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
	return Unpack;

	// If none of the unpack-rooted lowerings worked (or were profitable) try an
	// initial unpack.
	if (NumLoInputs == 0 \|\| NumHiInputs == 0) {
	assert((NumLoInputs > 0 \|\| NumHiInputs > 0) &&
	"We have to have some inputs!");
	// Index of the first element of the half that supplies all the inputs.
	int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

	// FIXME: We could consider the total complexity of the permute of each
	// possible unpacking. Or at the least we should consider how many
	// half-crossings are created.
	// FIXME: We could consider commuting the unpacks.

	// After the unpack, element k of the used half of V1 sits at position 2*k
	// and element k of the used half of V2 at position 2*k + 1; build the
	// permutation that fetches each requested element from there.
	SmallVector<int, 32> PermMask((unsigned)Size, -1);
	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

	PermMask[i] =
	2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
	}
	return DAG.getVectorShuffle(
	VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
	DL, VT, V1, V2),
	DAG.getUNDEF(VT), PermMask);
	}

	return SDValue();
	}

	/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
	///
	/// This is the basis function for the 2-lane 64-bit shuffles as we have full
	/// support for floating point shuffles but not integer shuffles. These
	/// instructions will incur a domain crossing penalty on some chips though so
	/// it is better to avoid lowering through this for integer vectors where
	/// possible.
	///
	/// \param Mask     Two-element shuffle mask (asserted).
	/// \param Zeroable Forwarded to the element-insertion and blend lowerings.
	/// \param V1       First v2f64 input (asserted).
	/// \param V2       Second v2f64 input (asserted); may be undef, in which case
	///                 the single-input path is taken.
	/// \returns the lowered value; every path through this function produces one.
	static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Straight shuffle of a single input vector. Simulate this by using the
	// single input as both of the "inputs" to this instruction.
	// Bit 0 of the immediate is set when result element 0 reads element 1,
	// bit 1 when result element 1 reads element 1.
	unsigned SHUFPDMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1);

	if (Subtarget.hasAVX()) {
	// If we have AVX, we can use VPERMILPS which will allow folding a load
	// into the shuffle.
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}

	// Without AVX use SHUFP with V1 as both operands; an operand whose result
	// element is undef in the mask is replaced by an undef vector.
	return DAG.getNode(
	X86ISD::SHUFP, DL, MVT::v2f64,
	Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
	Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}
	// From here on both inputs are used: element 0 comes from V1 and element 1
	// from V2.
	assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
	assert(Mask[1] >= 2 && "Non-canonicalized blend!");

	// If we have a single input, insert that into V1 if we can do so cheaply.
	if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Insertion;
	// Try inverting the insertion since for v2 masks it is easy to do and we
	// can't reliably sort the mask one way or the other.
	// XOR-ing an index with 2 moves it to the same element of the other input;
	// undef entries stay undef.
	int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
	Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	return Insertion;
	}

	// Try to use one of the special instruction patterns to handle two common
	// blend patterns if a zero-blend above didn't work.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) \|\|
	isShuffleEquivalent(V1, V2, Mask, {1, 3}))
	if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
	// We can either use a special instruction to load over the low double or
	// to move just the low double.
	return DAG.getNode(
	isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
	DL, MVT::v2f64, V2,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
	return V;

	// Fall back to a two-input SHUFP. Mask[1] indexes into V2, so subtract 2 to
	// make it relative to V2 before forming bit 1 of the immediate.
	unsigned SHUFPDMask = (Mask[0] == 1) \| (((Mask[1] - 2) == 1) << 1);
	return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}

	/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
	///
	/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
	/// the integer unit to minimize domain crossing penalties. However, for blends
	/// it falls back to the floating point shuffle operation with appropriate bit
	/// casting.
	///
	/// \param Mask     Two-element shuffle mask (asserted).
	/// \param Zeroable Forwarded to the shift, element-insertion and blend
	///                 lowerings.
	/// \param V1       First v2i64 input (asserted).
	/// \param V2       Second v2i64 input (asserted); may be undef, in which case
	///                 the single-input path is taken.
	/// \returns the lowered value.
	static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Straight shuffle of a single input vector. For everything from SSE2
	// onward this has a single fast instruction with no scary immediates.
	// We have to map the mask as it is actually a v4i32 shuffle instruction.
	V1 = DAG.getBitcast(MVT::v4i32, V1);
	// Each 64-bit index k becomes the pair of 32-bit indices (2k, 2k+1); undef
	// (-1) entries are clamped to 0 by std::max before widening.
	int WidenedMask[4] = {
	std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
	std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
	return DAG.getBitcast(
	MVT::v2i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
	getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
	}
	assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
	assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
	assert(Mask[0] < 2 && "We sort V1 to be the first input.");
	assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// When loading a scalar and then shuffling it into a vector we can often do
	// the insertion cheaply.
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Insertion;
	// Try inverting the insertion since for v2 masks it is easy to do and we
	// can't reliably sort the mask one way or the other.
	// XOR-ing an index with 2 moves it to the same element of the other input;
	// no undef handling is needed because both entries were asserted to be
	// defined above.
	int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	return Insertion;

	// We have different paths for blend lowering, but they all must use the
	// exact same predicate.
	bool IsBlendSupported = Subtarget.hasSSE41();
	if (IsBlendSupported)
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
	return V;

	// Try to use byte rotation instructions.
	// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
	if (Subtarget.hasSSSE3()) {
	// The element-rotate lowering is only attempted when VLX is available.
	if (Subtarget.hasVLX())
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	return Rotate;
	}

	// If we have direct support for blends, we should lower by decomposing into
	// a permute. That will be faster than the domain cross.
	if (IsBlendSupported)
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
	Mask, DAG);

	// We implement this with SHUFPD which is pretty lame because it will likely
	// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
	// However, all the alternatives are still more cycles and newer chips don't
	// have this problem. It would be really nice if x86 had better shuffles here.
	// The shuffle is re-emitted as a generic v2f64 vector shuffle on the bitcast
	// inputs and the result is bitcast back to v2i64.
	V1 = DAG.getBitcast(MVT::v2f64, V1);
	V2 = DAG.getBitcast(MVT::v2f64, V2);
	return DAG.getBitcast(MVT::v2i64,
	DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
	}

	/// \brief Report whether a 4-element mask is lowerable with one SHUFPS.
	///
	/// Used to turn off more specialized lowerings when the plain SHUFPS lowering
	/// is already efficient. A single SHUFPS suffices when the two low result
	/// elements draw from one input and the two high result elements draw from
	/// one input; undef (negative) entries are compatible with either input.
	static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
	  // This routine only handles 128-bit shufps.
	  assert(Mask.size() == 4 && "Unsupported mask size!");
	  assert(all_of(Mask, [](int M) { return M >= -1 && M < 8; }) &&
	         "Out of bound mask element!");

	  // True if the two given result lanes can be fed from a single input:
	  // either one of them is undef, or both index the same input (< 4 selects
	  // the first input, >= 4 the second).
	  auto PairUsesOneInput = [&Mask](int A, int B) {
	    if (Mask[A] < 0 || Mask[B] < 0)
	      return true;
	    return (Mask[A] < 4) == (Mask[B] < 4);
	  };

	  return PairUsesOneInput(0, 1) && PairUsesOneInput(2, 3);
	}

	/// \brief Lower a vector shuffle using the SHUFPS instruction.
	///
	/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
	/// It makes no assumptions about whether this is the best lowering, it simply
	/// uses it.
	///
	/// \param VT   Type used for every SHUFP node that is created.
	/// \param Mask Shuffle mask; elements 0-3 are read, entries >= 4 refer to
	///             \p V2.
	/// \param V1   First input.
	/// \param V2   Second input.
	/// \returns the final X86ISD::SHUFP node (preceded by one extra SHUFP when
	/// the inputs first have to be blended).
	static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
	ArrayRef<int> Mask, SDValue V1,
	SDValue V2, SelectionDAG &DAG) {
	// LowV/HighV are the operands of the final SHUFP and NewMask its mask; the
	// defaults describe the case where no adjustment is needed.
	SDValue LowV = V1, HighV = V2;
	int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

	int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	if (NumV2Elements == 1) {
	// Position within the mask of the only entry that refers to V2.
	int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

	// Compute the index adjacent to V2Index and in the same half by toggling
	// the low bit.
	int V2AdjIndex = V2Index ^ 1;

	if (Mask[V2AdjIndex] < 0) {
	// Handles all the cases where we have a single V2 element and an undef.
	// This will only ever happen in the high lanes because we commute the
	// vector otherwise.
	if (V2Index < 2)
	std::swap(LowV, HighV);
	NewMask[V2Index] -= 4;
	} else {
	// Handle the case where the V2 element ends up adjacent to a V1 element.
	// To make this work, blend them together as the first step.
	int V1Index = V2AdjIndex;
	int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
	V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
	getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

	// Now proceed to reconstruct the final blend as we have the necessary
	// high or low half formed.
	if (V2Index < 2) {
	LowV = V2;
	HighV = V1;
	} else {
	HighV = V2;
	}
	NewMask[V1Index] = 2; // We put the V1 element in V2[2].
	NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
	}
	} else if (NumV2Elements == 2) {
	if (Mask[0] < 4 && Mask[1] < 4) {
	// Handle the easy case where we have V1 in the low lanes and V2 in the
	// high lanes.
	NewMask[2] -= 4;
	NewMask[3] -= 4;
	} else if (Mask[2] < 4 && Mask[3] < 4) {
	// We also handle the reversed case because this utility may get called
	// when we detect a SHUFPS pattern but can't easily commute the shuffle to
	// arrange things in the right direction.
	NewMask[0] -= 4;
	NewMask[1] -= 4;
	HighV = V1;
	LowV = V2;
	} else {
	// We have a mixture of V1 and V2 in both low and high lanes. Rather than
	// trying to place elements directly, just blend them and set up the final
	// shuffle to place them.

	// The first two blend mask elements are for V1, the second two are for
	// V2.
	int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
	Mask[2] < 4 ? Mask[2] : Mask[3],
	(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
	(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
	V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

	// Now we do a normal shuffle of V1 by giving V1 as both operands to
	// a blend.
	// The blended vector holds, in order: the V1 element of the low half, the
	// V1 element of the high half, the V2 element of the low half and the V2
	// element of the high half, so pick indices 0/2 for the low result lanes
	// and 1/3 for the high result lanes.
	LowV = HighV = V1;
	NewMask[0] = Mask[0] < 4 ? 0 : 2;
	NewMask[1] = Mask[0] < 4 ? 2 : 0;
	NewMask[2] = Mask[2] < 4 ? 1 : 3;
	NewMask[3] = Mask[2] < 4 ? 3 : 1;
	}
	}
	// Any other number of V2 elements leaves LowV/HighV/NewMask at their
	// defaults.
	return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
	getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
	}

	/// \brief Lower 4-lane 32-bit floating point shuffles.
	///
	/// Uses instructions exclusively from the floating point unit to minimize
	/// domain crossing penalties, as these are sufficient to implement all v4f32
	/// shuffles.
	///
	/// \param Mask     Four-element shuffle mask (asserted); entries >= 4 refer
	///                 to \p V2.
	/// \param Zeroable Forwarded to the element-insertion, blend and INSERTPS
	///                 lowerings.
	/// \param V1       First v4f32 input (asserted).
	/// \param V2       Second v4f32 input (asserted).
	/// \returns the lowered value; the final SHUFPS fallback always produces one.
	static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	// Single-input shuffles: no mask entry refers to V2.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Use even/odd duplicate instructions for masks that match their pattern.
	if (Subtarget.hasSSE3()) {
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
	if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
	return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
	}

	if (Subtarget.hasAVX()) {
	// If we have AVX, we can use VPERMILPS which will allow folding a load
	// into the shuffle.
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
	// in SSE1 because otherwise they are widened to v2f64 and never get here.
	if (!Subtarget.hasSSE2()) {
	if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
	return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
	if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
	return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
	}

	// Otherwise, use a straight shuffle of a single input vector. We pass the
	// input vector to both operands to simulate this with a SHUFPS.
	return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// There are special ways we can lower some single-element blends. However, we
	// have custom ways we can lower more complex single-element blends below that
	// we defer to if both this and BLENDPS fail to match, so restrict this to
	// when the V2 input is targeting element 0 of the mask -- that is the fast
	// case here.
	if (NumV2Elements == 1 && Mask[0] >= 4)
	if (SDValue V = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	// The blend, INSERTPS and blend+permute lowerings are only tried with
	// SSE4.1.
	if (Subtarget.hasSSE41()) {
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use INSERTPS if we can complete the shuffle efficiently.
	if (SDValue V =
	lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
	return V;

	// Only consider blend+permute when one SHUFPS would not already do.
	if (!isSingleSHUFPSMask(Mask))
	if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
	DL, MVT::v4f32, V1, V2, Mask, DAG))
	return BlendPerm;
	}

	// Use low/high mov instructions. These are only valid in SSE1 because
	// otherwise they are widened to v2f64 and never get here.
	// Note that the MOVHLPS node takes its operands as (V2, V1).
	if (!Subtarget.hasSSE2()) {
	if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
	return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
	if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
	return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
	}

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
	return V;

	// Otherwise fall back to a SHUFPS lowering strategy.
	return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
	}

	/// \brief Lower a 4-lane i32 vector shuffle.
	///
	/// Integer-domain shuffles are preferred wherever one matches; blends and the
	/// final two-input fallback are done in the floating point domain instead.
	///
	/// \param DL        Debug location attached to every node created here.
	/// \param Mask      Four shuffle indices; 0-3 select from V1, 4-7 from V2 and
	///                  negative values are undef.
	/// \param Zeroable  Per-lane zeroable bits, forwarded to the helper lowerings.
	/// \param V1, V2    The v4i32 inputs.
	/// \returns the lowered value; this routine always produces a result.
	static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  const MVT VT = MVT::v4i32;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  // A zero/any-extend is strictly faster than every alternative and often
	  // lets a memory operand fold into the shuffle, so look for it first.
	  if (SDValue Extend = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Extend;

	  // Count how many lanes are sourced from V2 (mask values of 4 and above).
	  int NumV2Elements = 0;
	  for (int M : Mask)
	    if (M >= 4)
	      ++NumV2Elements;

	  // Single-input shuffles: broadcast if possible, otherwise one PSHUFD.
	  if (NumV2Elements == 0) {
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                      Subtarget, DAG))
	      return Splat;

	    // From SSE2 onward a single-input shuffle is one fast instruction. Masks
	    // that are equivalent to an UNPCK pattern are canonicalized to that exact
	    // pattern, but PSHUFD is still what gets emitted: using the UNPCK
	    // instruction itself would prevent folding a load or making a copy.
	    const int UnpackLoMask[] = {0, 0, 1, 1};
	    const int UnpackHiMask[] = {2, 2, 3, 3};
	    ArrayRef<int> PermuteMask = Mask;
	    if (isShuffleEquivalent(V1, V2, Mask, UnpackLoMask))
	      PermuteMask = UnpackLoMask;
	    else if (isShuffleEquivalent(V1, V2, Mask, UnpackHiMask))
	      PermuteMask = UnpackHiMask;

	    SDValue Imm = getV4X86ShuffleImm8ForMask(PermuteMask, DL, DAG);
	    return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1, Imm);
	  }

	  // Two-input shuffles from here on. Shifts come first.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Some single-element blends have a dedicated insertion lowering.
	  if (NumV2Elements == 1) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Every blend-related path below must agree on this one predicate.
	  const bool IsBlendSupported = Subtarget.hasSSE41();
	  if (IsBlendSupported) {
	    if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                    Zeroable, Subtarget, DAG))
	      return Blended;
	  }

	  if (SDValue BitMasked =
	          lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
	    return BitMasked;

	  // Masks that are exactly an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Rotations are only tried with SSSE3; before that it is more profitable to
	  // use shuffles/unpacks.
	  if (Subtarget.hasSSSE3()) {
	    if (Subtarget.hasVLX()) {
	      if (SDValue ElementRotate = lowerVectorShuffleAsRotate(
	              DL, VT, V1, V2, Mask, Subtarget, DAG))
	        return ElementRotate;
	    }

	    if (SDValue ByteRotate = lowerVectorShuffleAsByteRotate(
	            DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return ByteRotate;
	  }

	  // A single SHUFPS is assumed to beat any multi-instruction sequence, even on
	  // CPUs with a domain-crossing penalty (a later pass can fix that up if some
	  // CPU is harmed). Only look for alternatives when one SHUFPS cannot do it.
	  if (!isSingleSHUFPSMask(Mask)) {
	    // With real blend support, decomposing into permutes plus a blend is
	    // faster than crossing domains.
	    if (IsBlendSupported)
	      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
	                                                        DAG);

	    // Otherwise try permuting the inputs so that an unpack finishes the job.
	    if (SDValue PermutedUnpack = lowerVectorShuffleAsPermuteAndUnpack(
	            DL, VT, V1, V2, Mask, DAG))
	      return PermutedUnpack;
	  }

	  // Fall back to SHUFPS, which can blend from two vectors. Since SHUFPS is the
	  // final instruction anyway, it is also used to build up the inputs; that
	  // avoids the domain shift penalties a direct PSHUFD would incur on Nehalem
	  // and older. For newer chips this is not relevant.
	  SDValue FloatV1 = DAG.getBitcast(MVT::v4f32, V1);
	  SDValue FloatV2 = DAG.getBitcast(MVT::v4f32, V2);
	  SDValue FloatShuffle =
	      DAG.getVectorShuffle(MVT::v4f32, DL, FloatV1, FloatV2, Mask);
	  return DAG.getBitcast(VT, FloatShuffle);
	}

	/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
	/// shuffle lowering, and the most complex part.
	///
	/// The lowering strategy is to try to form pairs of input lanes which are
	/// targeted at the same half of the final vector, and then use a dword shuffle
	/// to place them onto the right half, and finally unpack the paired lanes into
	/// their final position.
	///
	/// The exact breakdown of how to form these dword pairs and align them on the
	/// correct sides is really tricky. See the comments within the function for
	/// more of the details.
	///
	/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
	/// lane must shuffle the exact same way. In fact, you must pass a v8 Mask to
	/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
	/// vector, form the analogous 128-bit 8-element Mask.
	///
	/// \param DL         Debug location attached to every node created here.
	/// \param VT         Vector type with i16 elements (asserted).
	/// \param V          The single input vector.
	/// \param Mask       Exactly 8 entries (asserted); negative entries are undef.
	///                   The mask is rewritten in place while lowering, so callers
	///                   must pass a scratch copy.
	/// \param Subtarget  Only forwarded to the recursive call.
	/// \returns the lowered shuffle, built from PSHUFLW / PSHUFHW / PSHUFD nodes.
	static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
	    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
	    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
	  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

	  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
	  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
	  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

	  // Attempt to directly match PSHUFLW or PSHUFHW.
	  if (isUndefOrInRange(LoMask, 0, 4) &&
	      isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	    return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	                       getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
	  }
	  if (isUndefOrInRange(HiMask, 4, 8) &&
	      isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	    // Rebase the defined high-half indices to 0..3 for the immediate.
	    for (int i = 0; i != 4; ++i)
	      HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
	    return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	                       getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
	  }

	  // Collect the sorted, de-duplicated set of source words feeding each half of
	  // the result, then split each set into the words that come from the low half
	  // (< 4) and from the high half (>= 4) of the input.
	  SmallVector<int, 4> LoInputs;
	  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
	  std::sort(LoInputs.begin(), LoInputs.end());
	  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
	  SmallVector<int, 4> HiInputs;
	  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
	  std::sort(HiInputs.begin(), HiInputs.end());
	  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
	  int NumLToL =
	      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
	  int NumHToL = LoInputs.size() - NumLToL;
	  int NumLToH =
	      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
	  int NumHToH = HiInputs.size() - NumLToH;
	  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
	  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
	  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
	  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

	  // If we are shuffling values from one half - check how many different DWORD
	  // pairs we need to create. If only 1 or 2 then we can perform this as a
	  // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
	  auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
	                               ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
	    V = DAG.getNode(ShufWOp, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
	    V = DAG.getBitcast(PSHUFDVT, V);
	    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
	                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	    return DAG.getBitcast(VT, V);
	  };

	  if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
	    int PSHUFDMask[4] = { -1, -1, -1, -1 };
	    SmallVector<std::pair<int, int>, 4> DWordPairs;
	    // Dword index base of the half all inputs come from (0 = low, 2 = high).
	    int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

	    // Collect the different DWORD pairs.
	    for (int DWord = 0; DWord != 4; ++DWord) {
	      int M0 = Mask[2 * DWord + 0];
	      int M1 = Mask[2 * DWord + 1];
	      M0 = (M0 >= 0 ? M0 % 4 : M0);
	      M1 = (M1 >= 0 ? M1 % 4 : M1);
	      if (M0 < 0 && M1 < 0)
	        continue;

	      // Reuse an already collected pair when it is compatible, filling in any
	      // of its undef slots from this dword.
	      bool Match = false;
	      for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
	        auto &DWordPair = DWordPairs[j];
	        if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
	            (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
	          DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
	          DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
	          PSHUFDMask[DWord] = DOffset + j;
	          Match = true;
	          break;
	        }
	      }
	      if (!Match) {
	        PSHUFDMask[DWord] = DOffset + DWordPairs.size();
	        DWordPairs.push_back(std::make_pair(M0, M1));
	      }
	    }

	    if (DWordPairs.size() <= 2) {
	      DWordPairs.resize(2, std::make_pair(-1, -1));
	      int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
	                              DWordPairs[1].first, DWordPairs[1].second};
	      if ((NumHToL + NumHToH) == 0)
	        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
	      if ((NumLToL + NumLToH) == 0)
	        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
	    }
	  }

	  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
	  // such inputs we can swap two of the dwords across the half mark and end up
	  // with <=2 inputs to each half in each half. Once there, we can fall through
	  // to the generic code below. For example:
	  //
	  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
	  //
	  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
	  // and an existing 2-into-2 on the other half. In this case we may have to
	  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
	  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
	  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
	  // because any other situation (including a 3-into-1 or 1-into-3 in the other
	  // half than the one we target for fixing) will be fixed when we re-enter this
	  // path. We will also combine away any sequence of PSHUFD instructions that
	  // result into a single instruction. Here is an example of the tricky case:
	  //
	  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
	  //
	  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
	  //
	  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
	  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
	  //
	  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
	  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
	  //
	  // The result is fine to be handled by the generic logic.
	  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
	                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
	                          int AOffset, int BOffset) {
	    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
	           "Must call this with A having 3 or 1 inputs from the A half.");
	    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
	           "Must call this with B having 1 or 3 inputs from the B half.");
	    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
	           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

	    bool ThreeAInputs = AToAInputs.size() == 3;

	    // Compute the index of dword with only one word among the three inputs in
	    // a half by taking the sum of the half with three inputs and subtracting
	    // the sum of the actual three inputs. The difference is the remaining
	    // slot.
	    int ADWord, BDWord;
	    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
	    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
	    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
	    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
	    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
	    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
	    int TripleNonInputIdx =
	        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
	    TripleDWord = TripleNonInputIdx / 2;

	    // We use xor with one to compute the adjacent DWord to whichever one the
	    // OneInput is in.
	    OneInputDWord = (OneInput / 2) ^ 1;

	    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
	    // and BToA inputs. If there is also such a problem with the BToB and AToB
	    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
	    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
	    // is essential that we don't create a 3<-1 as then we might oscillate.
	    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
	      // Compute how many inputs will be flipped by swapping these DWords. We
	      // need to balance this to ensure we don't form a 3-1 shuffle in the
	      // other half.
	      int NumFlippedAToBInputs =
	          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
	          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
	      int NumFlippedBToBInputs =
	          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
	          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
	      if ((NumFlippedAToBInputs == 1 &&
	           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
	          (NumFlippedBToBInputs == 1 &&
	           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
	        // We choose whether to fix the A half or B half based on whether that
	        // half has zero flipped inputs. At zero, we may not be able to fix it
	        // with that half. We also bias towards fixing the B half because that
	        // will more commonly be the high half, and we have to bias one way.
	        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
	                                                       ArrayRef<int> Inputs) {
	          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
	          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
	          // Determine whether the free index is in the flipped dword or the
	          // unflipped dword based on where the pinned index is. We use this bit
	          // in an xor to conditionally select the adjacent dword.
	          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
	          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	          if (IsFixIdxInput == IsFixFreeIdxInput)
	            FixFreeIdx += 1;
	          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	          assert(IsFixIdxInput != IsFixFreeIdxInput &&
	                 "We need to be changing the number of flipped inputs!");
	          // Swap the two words with a half-word shuffle of the half holding
	          // FixIdx, then mirror the swap in the mask.
	          int PSHUFHalfMask[] = {0, 1, 2, 3};
	          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
	          V = DAG.getNode(
	              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
	              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
	              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

	          for (int &M : Mask)
	            if (M >= 0 && M == FixIdx)
	              M = FixFreeIdx;
	            else if (M >= 0 && M == FixFreeIdx)
	              M = FixIdx;
	        };
	        if (NumFlippedBToBInputs != 0) {
	          int BPinnedIdx =
	              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
	          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
	        } else {
	          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
	          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
	          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
	        }
	      }
	    }

	    // Swap the chosen A and B dwords with a single PSHUFD.
	    int PSHUFDMask[] = {0, 1, 2, 3};
	    PSHUFDMask[ADWord] = BDWord;
	    PSHUFDMask[BDWord] = ADWord;
	    V = DAG.getBitcast(
	        VT,
	        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	    // Adjust the mask to match the new locations of A and B.
	    for (int &M : Mask)
	      if (M >= 0 && M/2 == ADWord)
	        M = 2 * BDWord + M % 2;
	      else if (M >= 0 && M/2 == BDWord)
	        M = 2 * ADWord + M % 2;

	    // Recurse back into this routine to re-compute state now that this isn't
	    // a 3 and 1 problem.
	    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
	                                                     DAG);
	  };
	  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
	    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
	  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
	    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

	  // At this point there are at most two inputs to the low and high halves from
	  // each half. That means the inputs can always be grouped into dwords and
	  // those dwords can then be moved to the correct half with a dword shuffle.
	  // We use at most one low and one high word shuffle to collect these paired
	  // inputs into dwords, and finally a dword shuffle to place them.
	  int PSHUFLMask[4] = {-1, -1, -1, -1};
	  int PSHUFHMask[4] = {-1, -1, -1, -1};
	  int PSHUFDMask[4] = {-1, -1, -1, -1};

	  // First fix the masks for all the inputs that are staying in their
	  // original halves. This will then dictate the targets of the cross-half
	  // shuffles.
	  auto fixInPlaceInputs =
	      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
	                    MutableArrayRef<int> SourceHalfMask,
	                    MutableArrayRef<int> HalfMask, int HalfOffset) {
	    if (InPlaceInputs.empty())
	      return;
	    if (InPlaceInputs.size() == 1) {
	      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	          InPlaceInputs[0] - HalfOffset;
	      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
	      return;
	    }
	    if (IncomingInputs.empty()) {
	      // Just fix all of the in place inputs.
	      for (int Input : InPlaceInputs) {
	        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
	        PSHUFDMask[Input / 2] = Input / 2;
	      }
	      return;
	    }

	    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
	    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	        InPlaceInputs[0] - HalfOffset;
	    // Put the second input next to the first so that they are packed into
	    // a dword. We find the adjacent index by toggling the low bit.
	    int AdjIndex = InPlaceInputs[0] ^ 1;
	    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
	    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
	    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
	  };
	  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
	  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

	  // Now gather the cross-half inputs and place them into a free dword of
	  // their target half.
	  // FIXME: This operation could almost certainly be simplified dramatically to
	  // look more like the 3-1 fixing operation.
	  auto moveInputsToRightHalf = [&PSHUFDMask](
	      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
	      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
	      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
	      int DestOffset) {
	    // A word is clobbered when the source half mask already assigns a
	    // different word to its slot.
	    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
	      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
	    };
	    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
	                                               int Word) {
	      int LowWord = Word & ~1;
	      int HighWord = Word | 1;
	      return isWordClobbered(SourceHalfMask, LowWord) ||
	             isWordClobbered(SourceHalfMask, HighWord);
	    };

	    if (IncomingInputs.empty())
	      return;

	    if (ExistingInputs.empty()) {
	      // Map any dwords with inputs from them into the right half.
	      for (int Input : IncomingInputs) {
	        // If the source half mask maps over the inputs, turn those into
	        // swaps and use the swapped lane.
	        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
	          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
	            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
	                Input - SourceOffset;
	            // We have to swap the uses in our half mask in one sweep.
	            for (int &M : HalfMask)
	              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
	                M = Input;
	              else if (M == Input)
	                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	          } else {
	            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
	                       Input - SourceOffset &&
	                   "Previous placement doesn't match!");
	          }
	          // Note that this correctly re-maps both when we do a swap and when
	          // we observe the other side of the swap above. We rely on that to
	          // avoid swapping the members of the input list directly.
	          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	        }

	        // Map the input's dword into the correct half.
	        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
	          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
	        else
	          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
	                     Input / 2 &&
	                 "Previous placement doesn't match!");
	      }

	      // And just directly shift any other-half mask elements to be same-half
	      // as we will have mirrored the dword containing the element into the
	      // same position within that half.
	      for (int &M : HalfMask)
	        if (M >= SourceOffset && M < SourceOffset + 4) {
	          M = M - SourceOffset + DestOffset;
	          assert(M >= 0 && "This should never wrap below zero!");
	        }
	      return;
	    }

	    // Ensure we have the input in a viable dword of its current half. This
	    // is particularly tricky because the original position may be clobbered
	    // by inputs being moved and staying in that half.
	    if (IncomingInputs.size() == 1) {
	      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
	                         SourceOffset;
	        SourceHalfMask[InputFixed - SourceOffset] =
	            IncomingInputs[0] - SourceOffset;
	        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
	                     InputFixed);
	        IncomingInputs[0] = InputFixed;
	      }
	    } else if (IncomingInputs.size() == 2) {
	      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
	          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	        // We have two non-adjacent or clobbered inputs we need to extract from
	        // the source half. To do this, we need to map them into some adjacent
	        // dword slot in the source mask.
	        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
	                              IncomingInputs[1] - SourceOffset};

	        // If there is a free slot in the source half mask adjacent to one of
	        // the inputs, place the other input in it. We use (Index XOR 1) to
	        // compute an adjacent index.
	        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
	            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
	          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
	          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	          InputsFixed[1] = InputsFixed[0] ^ 1;
	        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
	                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
	          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
	          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
	          InputsFixed[0] = InputsFixed[1] ^ 1;
	        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
	                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
	          // The two inputs are in the same DWord but it is clobbered and the
	          // adjacent DWord isn't used at all. Move both inputs to the free
	          // slot.
	          //
	          // The free slot is computed once, up front: deriving the second
	          // index from InputsFixed[0] after it has been overwritten would
	          // flip back to the clobbered DWord.
	          const int FreeSlot = 2 * ((InputsFixed[0] / 2) ^ 1);
	          SourceHalfMask[FreeSlot] = InputsFixed[0];
	          SourceHalfMask[FreeSlot + 1] = InputsFixed[1];
	          InputsFixed[0] = FreeSlot;
	          InputsFixed[1] = FreeSlot + 1;
	        } else {
	          // The only way we hit this point is if there is no clobbering
	          // (because there are no off-half inputs to this half) and there is no
	          // free slot adjacent to one of the inputs. In this case, we have to
	          // swap an input with a non-input.
	          for (int i = 0; i < 4; ++i)
	            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
	                   "We can't handle any clobbers here!");
	          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
	                 "Cannot have adjacent inputs here!");

	          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

	          // We also have to update the final source mask in this case because
	          // it may need to undo the above swap.
	          for (int &M : FinalSourceHalfMask)
	            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
	              M = InputsFixed[1] + SourceOffset;
	            else if (M == InputsFixed[1] + SourceOffset)
	              M = (InputsFixed[0] ^ 1) + SourceOffset;

	          InputsFixed[1] = InputsFixed[0] ^ 1;
	        }

	        // Point everything at the fixed inputs.
	        for (int &M : HalfMask)
	          if (M == IncomingInputs[0])
	            M = InputsFixed[0] + SourceOffset;
	          else if (M == IncomingInputs[1])
	            M = InputsFixed[1] + SourceOffset;

	        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
	        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
	      }
	    } else {
	      llvm_unreachable("Unhandled input size!");
	    }

	    // Now hoist the DWord down to the right half.
	    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
	    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
	    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
	    for (int &M : HalfMask)
	      for (int Input : IncomingInputs)
	        if (M == Input)
	          M = FreeDWord * 2 + Input % 2;
	  };
	  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
	                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
	  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
	                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

	  // Now enact all the shuffles we've computed to move the inputs into their
	  // target half.
	  if (!isNoopShuffleMask(PSHUFLMask))
	    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
	  if (!isNoopShuffleMask(PSHUFHMask))
	    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
	  if (!isNoopShuffleMask(PSHUFDMask))
	    V = DAG.getBitcast(
	        VT,
	        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	  // At this point, each half should contain all its inputs, and we can then
	  // just shuffle them into their final position.
	  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
	         "Failed to lift all the high half inputs to the low mask!");
	  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
	         "Failed to lift all the low half inputs to the high mask!");

	  // Do a half shuffle for the low mask.
	  if (!isNoopShuffleMask(LoMask))
	    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

	  // Do a half shuffle with the high mask after shifting its values down.
	  for (int &M : HiMask)
	    if (M >= 0)
	      M -= 4;
	  if (!isNoopShuffleMask(HiMask))
	    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

	  return V;
	}

	/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
	/// blend if only one input is used.
	///
	/// A 16-entry byte selector is built for each input. Bytes that an input does
	/// not supply (and bytes of zeroable elements) get the selector 0x80; undef
	/// mask elements get undef selectors in both. Each input that supplies at
	/// least one byte is shuffled with PSHUFB as v16i8, and when both do, the two
	/// results are ORed together.
	///
	/// \param VT        Result type; the v16i8 result is bitcast back to it.
	/// \param Mask      Shuffle mask whose size must evenly divide 16. Indices
	///                  below Mask.size() select from V1, the rest from V2.
	/// \param Zeroable  Bit i set means result element i is emitted as 0x80
	///                  selectors for both inputs.
	/// \param V1InUse   Out: true if any V1 selector is not 0x80.
	/// \param V2InUse   Out: true if any V2 selector is not 0x80.
	/// \returns the shuffled (and, if needed, blended) value as type VT. If
	///          neither input ends up in use, V2 is returned bitcast but not
	///          shuffled.
	static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
	    bool &V2InUse) {
	  SDValue V1Mask[16];
	  SDValue V2Mask[16];
	  V1InUse = false;
	  V2InUse = false;

	  int Size = Mask.size();
	  // Scale is used as a divisor below, so the mask has to split the 16 byte
	  // selectors into equal, non-empty groups.
	  assert(Size > 0 && Size <= 16 && 16 % Size == 0 &&
	         "Mask size must evenly divide the 16 byte selectors!");
	  // Number of byte selectors per mask element.
	  int Scale = 16 / Size;
	  for (int i = 0; i < 16; ++i) {
	    const int M = Mask[i / Scale];
	    if (M < 0) {
	      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
	    } else {
	      const int ZeroMask = 0x80;
	      // Exactly one of the two inputs supplies this byte; the other one gets
	      // the zeroing selector so the results can later be ORed.
	      int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
	      int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
	      if (Zeroable[i / Scale])
	        V1Idx = V2Idx = ZeroMask;
	      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
	      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
	      V1InUse |= (ZeroMask != V1Idx);
	      V2InUse |= (ZeroMask != V2Idx);
	    }
	  }

	  // Only emit a PSHUFB for an input that contributes at least one byte.
	  if (V1InUse)
	    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
	                     DAG.getBitcast(MVT::v16i8, V1),
	                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
	  if (V2InUse)
	    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
	                     DAG.getBitcast(MVT::v16i8, V2),
	                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

	  // If we need shuffled inputs from both, blend the two.
	  SDValue V;
	  if (V1InUse && V2InUse)
	    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
	  else
	    V = V1InUse ? V1 : V2;

	  // Cast the result back to the correct type.
	  return DAG.getBitcast(VT, V);
	}

	/// \brief Generic lowering of 8-lane i16 shuffles.
	///
	/// Covers both single-input shuffles and two-input shuffle/blends. After the
	/// cheap pattern matches fail, a single-input shuffle is handed to
	/// lowerV8I16GeneralSingleInputVectorShuffle on a private copy of the mask.
	///
	/// Two-input shuffles run through a fixed sequence of strategies (shift,
	/// SSE4A, element insertion, blend, bit mask, unpack, pack, byte rotate, bit
	/// blend, permute+unpack). If none matches, the result is a blend of PSHUFBs
	/// when SSSE3 is available without SSE4.1, and otherwise a decomposition into
	/// single-input permutes plus a blend.
	///
	/// \param Mask  Eight shuffle indices; 0-7 select from V1, 8-15 from V2 and
	///              negative values are undef.
	/// \returns the lowered value; this routine always produces a result.
	static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  const MVT VT = MVT::v8i16;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  // A zero/any-extend is strictly faster than any alternative, so it is
	  // tried before everything else.
	  if (SDValue Extend = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Extend;

	  // Count how many lanes are sourced from V2 (mask values of 8 and above).
	  int NumV2Inputs = 0;
	  for (int M : Mask)
	    if (M >= 8)
	      ++NumV2Inputs;

	  if (NumV2Inputs == 0) {
	    // Single-input shuffle. Try a broadcast of one element first.
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                      Subtarget, DAG))
	      return Splat;

	    // Shift instructions; V1 is passed for both operands here.
	    if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V1, Mask,
	                                                    Zeroable, Subtarget, DAG))
	      return Shifted;

	    // Masks that are exactly an unpack pattern get the dedicated instruction.
	    if (SDValue Unpacked =
	            lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Unpacked;

	    // Likewise for masks that are exactly a pack pattern.
	    if (SDValue Packed = lowerVectorShuffleWithPACK(DL, VT, Mask, V1, V2, DAG,
	                                                    Subtarget))
	      return Packed;

	    // Byte rotation; again V1 is passed for both operands.
	    if (SDValue Rotated = lowerVectorShuffleAsByteRotate(DL, VT, V1, V1, Mask,
	                                                         Subtarget, DAG))
	      return Rotated;

	    // The general routine rewrites its mask in place, so give it a copy.
	    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
	    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V1, MutableMask,
	                                                     Subtarget, DAG);
	  }

	  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
	         "All single-input shuffles should be canonicalized to be V1-input "
	         "shuffles.");

	  // Two-input shuffles from here on. Shifts come first.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // SSE4A extraction / insertion, when the subtarget has it.
	  if (Subtarget.hasSSE4A()) {
	    if (SDValue Ext = lowerVectorShuffleWithSSE4A(DL, VT, V1, V2, Mask,
	                                                  Zeroable, DAG))
	      return Ext;
	  }

	  // Some single-element blends have a dedicated insertion lowering.
	  if (NumV2Inputs == 1) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Every blend-related path below must agree on this one predicate.
	  const bool IsBlendSupported = Subtarget.hasSSE41();
	  if (IsBlendSupported) {
	    if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                    Zeroable, Subtarget, DAG))
	      return Blended;
	  }

	  if (SDValue BitMasked =
	          lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
	    return BitMasked;

	  // Masks that are exactly an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Likewise for masks that are exactly a pack pattern.
	  if (SDValue Packed =
	          lowerVectorShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
	    return Packed;

	  // Byte rotation across the two inputs.
	  if (SDValue Rotated = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
	                                                       Subtarget, DAG))
	    return Rotated;

	  if (SDValue BitBlended =
	          lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
	    return BitBlended;

	  // Try permuting the inputs so that an unpack instruction finishes the job.
	  if (SDValue PermutedUnpack =
	          lowerVectorShuffleAsPermuteAndUnpack(DL, VT, V1, V2, Mask, DAG))
	    return PermutedUnpack;

	  // Without a direct blend but with PSHUFB available, PSHUFB is the better
	  // choice: it can both shuffle and set up the inefficient blend.
	  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
	    bool V1InUse, V2InUse;
	    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, VT, V1, V2, Mask, Zeroable,
	                                              DAG, V1InUse, V2InUse);
	  }

	  // A bit-blend is always possible, so the last resort is to decompose into
	  // single-input permutes and blends.
	  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Test whether a shuffle mask selects every 2^N-th element, so that it
	/// can be lowered by dropping even elements N times.
	///
	/// Lane i of a matching mask holds (i << N) reduced modulo the number of
	/// addressable input elements (the mask size for a single input, twice the
	/// mask size for two inputs). Example shuffle masks:
	///
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
	/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
	/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
	/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
	/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
	///
	/// Negative (undef) lanes are compatible with every candidate N.
	///
	/// Only N = 1, 2 and 3 are tested.
	/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
	/// for larger N.
	///
	/// \param Mask the shuffle mask to classify.
	/// \param IsSingleInput true when the indices address one input vector only.
	/// \returns the smallest N in [1, 3] that every defined lane agrees with, or
	/// zero when no such N exists.
	static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
	                                          bool IsSingleInput) {
	  const int NumLanes = Mask.size();

	  // Indices wrap at the total number of addressable input elements.
	  const int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
	  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
	         "We should only be called with masks with a power-of-2 size!");
	  const uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

	  // All candidate strides are tracked at once because partially undef masks
	  // can stay ambiguous between them until a later lane decides.
	  bool ViableForN[3] = {true, true, true};
	  const unsigned NumCandidates = array_lengthof(ViableForN);

	  for (int Lane = 0; Lane < NumLanes; ++Lane) {
	    const int M = Mask[Lane];
	    if (M < 0)
	      continue; // Undef lanes never rule a candidate out.

	    bool AnyLeft = false;
	    for (unsigned Cand = 0; Cand != NumCandidates; ++Cand) {
	      if (!ViableForN[Cand])
	        continue;

	      // Candidate Cand stands for N = Cand + 1; the lane must hold
	      // (Lane * 2^N) % ShuffleModulus.
	      const uint64_t N = Cand + 1;
	      const uint64_t Expected = ((uint64_t)Lane << N) & ModMask;
	      if ((uint64_t)M == Expected)
	        AnyLeft = true;
	      else
	        ViableForN[Cand] = false;
	    }

	    // Every candidate has been eliminated, so no later lane can help.
	    if (!AnyLeft)
	      return 0;
	  }

	  // Report the smallest surviving candidate.
	  for (unsigned Cand = 0; Cand != NumCandidates; ++Cand)
	    if (ViableForN[Cand])
	      return Cand + 1;

	  return 0;
	}

	/// Lower a shuffle to a variable permute node whose indices are a constant
	/// vector built from \p Mask.
	///
	/// Emits X86ISD::VPERMV (index vector, V1) when \p V2 is undef, and
	/// X86ISD::VPERMV3 (V1, index vector, V2) otherwise.
	static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
	                                           ArrayRef<int> Mask, SDValue V1,
	                                           SDValue V2, SelectionDAG &DAG) {
	  // The index vector has integer elements of the same width and count as VT.
	  MVT IndexEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
	  MVT IndexVT = MVT::getVectorVT(IndexEltVT, VT.getVectorNumElements());
	  SDValue Indices = getConstVector(Mask, IndexVT, DAG, DL, true);

	  // Two live inputs need the three-operand form.
	  if (!V2.isUndef())
	    return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, Indices, V2);

	  return DAG.getNode(X86ISD::VPERMV, DL, VT, Indices, V1);
	}

	/// \brief Generic lowering of v16i8 shuffles.
	///
	/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
	/// detect any complexity reducing interleaving. If that doesn't help, it uses
	/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
	/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
	/// back together.
	///
	/// The strategies are tried in a fixed order and the first one that yields a
	/// non-null SDValue wins.
	///
	/// \param DL debug location attached to every node created here.
	/// \param Mask 16-entry shuffle mask (asserted). Negative entries are treated
	/// as undef lanes, entries in [0, 16) select from \p V1 and entries >= 16 are
	/// counted as \p V2 inputs.
	/// \param Zeroable forwarded unchanged to the helper lowerings that accept it.
	/// \param V1 first v16i8 input (asserted).
	/// \param V2 second v16i8 input (asserted).
	/// \param Subtarget queried for SSE4A, SSSE3, SSE4.1, VBMI and VLX support.
	/// \param DAG selection DAG used to build the replacement nodes.
	/// \returns the lowered value.
	static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use a zext lowering.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// See if we can use SSE4A Extraction / Insertion.
	if (Subtarget.hasSSE4A())
	if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return V;

	// Number of mask entries that select from V2 (index >= 16).
	int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

	// For single-input shuffles, there are some nicer lowering tricks we can use.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Check whether we can widen this to an i16 shuffle by duplicating bytes.
	// Notably, this handles splat and partial-splat shuffles more efficiently.
	// However, it only makes sense if the pre-duplication shuffle simplifies
	// things significantly. Currently, this means we need to be able to
	// express the pre-duplication shuffle as an i16 shuffle.
	//
	// FIXME: We should check for other patterns which can be widened into an
	// i16 shuffle as well.
	// True when every adjacent byte pair (2k, 2k+1) of the mask either contains
	// an undef lane or names the same source byte twice.
	auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
	for (int i = 0; i < 16; i += 2)
	if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
	return false;

	return true;
	};
	// Attempts the duplication lowering; returns a null SDValue when it does not
	// apply. Note that it overwrites the captured V1, but only after the last
	// point at which it can bail out.
	auto tryToWidenViaDuplication = [&]() -> SDValue {
	if (!canWidenViaDuplication(Mask))
	return SDValue();
	// Sorted, de-duplicated source bytes referenced from the low half [0, 8).
	SmallVector<int, 4> LoInputs;
	copy_if(Mask, std::back_inserter(LoInputs),
	[](int M) { return M >= 0 && M < 8; });
	std::sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
	LoInputs.end());
	// Sorted, de-duplicated source bytes referenced from the high half (>= 8).
	SmallVector<int, 4> HiInputs;
	copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
	std::sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
	HiInputs.end());

	// The half with at least as many distinct inputs stays in place; the inputs
	// of the other half are moved into it.
	bool TargetLo = LoInputs.size() >= HiInputs.size();
	ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
	ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

	// i16 shuffle applied before the byte duplication, and the map from each
	// original source byte to the byte position it occupies after that shuffle.
	int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
	SmallDenseMap<int, int, 8> LaneMap;
	for (int I : InPlaceInputs) {
	PreDupI16Shuffle[I/2] = I/2;
	LaneMap[I] = I;
	}
	// [j, je) is the range of i16 slots belonging to the target half.
	int j = TargetLo ? 0 : 4, je = j + 4;
	for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
	// Check if j is already a shuffle of this input. This happens when
	// there are two adjacent bytes after we move the low one.
	if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
	// If we haven't yet mapped the input, search for a slot into which
	// we can map it.
	while (j < je && PreDupI16Shuffle[j] >= 0)
	++j;

	if (j == je)
	// We can't place the inputs into a single half with a simple i16 shuffle, so bail.
	return SDValue();

	// Map this input with the i16 shuffle.
	PreDupI16Shuffle[j] = MovingInputs[i] / 2;
	}

	// Update the lane map based on the mapping we ended up with.
	LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
	}
	// Gather all needed i16 words into the target half.
	V1 = DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

	// Unpack the bytes to form the i16s that will be shuffled into place.
	V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	MVT::v16i8, V1, V1);

	// Final i16 shuffle: result word i/2 takes the duplicated byte that the
	// original mask asked for, expressed relative to the target half.
	int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0) {
	int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
	assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
	if (PostDupI16Shuffle[i / 2] < 0)
	PostDupI16Shuffle[i / 2] = MappedMask;
	else
	assert(PostDupI16Shuffle[i / 2] == MappedMask &&
	"Conflicting entries in the original shuffle!");
	}
	return DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
	};
	if (SDValue V = tryToWidenViaDuplication())
	return V;
	}

	if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
	return V;

	// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
	// with PSHUFB. It is important to do this before we attempt to generate any
	// blends but after all of the single-input lowerings. If the single input
	// lowerings can find an instruction sequence that is faster than a PSHUFB, we
	// want to preserve that and we can DAG combine any longer sequences into
	// a PSHUFB in the end. But once we start blending from multiple inputs,
	// the complexity of DAG combining bad patterns back into PSHUFB is too high,
	// and there are very few patterns that would actually be faster than the
	// PSHUFB approach because of its ability to zero lanes.
	//
	// FIXME: The only exceptions to the above are blends which are exact
	// interleavings with direct instructions supporting them. We currently don't
	// handle those well here.
	if (Subtarget.hasSSSE3()) {
	// Output flags filled in by lowerVectorShuffleAsBlendOfPSHUFBs.
	bool V1InUse = false;
	bool V2InUse = false;

	SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

	// If both V1 and V2 are in use and we can use a direct blend or an unpack,
	// do so. This avoids using them to handle blends-with-zero which is
	// important as a single pshufb is significantly faster for that.
	if (V1InUse && V2InUse) {
	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerVectorShuffleAsBlend(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Blend;

	// We can use an unpack to do the blending rather than an or in some
	// cases. Even though the or may be (very minorly) more efficient, we
	// preference this lowering because there are common cases where part of
	// the complexity of the shuffles goes away when we do the final blend as
	// an unpack.
	// FIXME: It might be worth trying to detect if the unpack-feeding
	// shuffles will both be pshufb, in which case we shouldn't bother with
	// this.
	if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
	DL, MVT::v16i8, V1, V2, Mask, DAG))
	return Unpack;

	// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
	if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
	}

	return PSHUFB;
	}

	// Everything below is only reached without SSSE3.

	// There are special ways we can lower some single-element blends.
	if (NumV2Elements == 1)
	if (SDValue V = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	if (SDValue BitBlend =
	lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
	return BitBlend;

	// Check whether a compaction lowering can be done. This handles shuffles
	// which take every Nth element for some even N. See the helper function for
	// details.
	//
	// We special case these as they can be particularly efficiently handled with
	// the PACKUSB instruction on x86 and they show up in common patterns of
	// rearranging bytes to truncate wide elements.
	bool IsSingleInput = V2.isUndef();
	if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
	// NumEvenDrops is the power of two stride of the elements. Another way of
	// thinking about it is that we need to drop the even elements this many
	// times to get the original input.

	// First we need to zero all the dropped bytes.
	assert(NumEvenDrops <= 3 &&
	"No support for dropping even elements more than 3 times.");
	// We use the mask type to pick which bytes are preserved based on how many
	// elements are dropped.
	// A splat of 0xFF in i16/i32/i64 elements, viewed as bytes, keeps one byte
	// per 2/4/8 and clears the others.
	MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
	SDValue ByteClearMask = DAG.getBitcast(
	MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
	V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
	if (!IsSingleInput)
	V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

	// Now pack things back together.
	V1 = DAG.getBitcast(MVT::v8i16, V1);
	V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
	SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
	// One PACKUS has been emitted above; emit one more for each further drop.
	for (int i = 1; i < NumEvenDrops; ++i) {
	Result = DAG.getBitcast(MVT::v8i16, Result);
	Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
	}

	return Result;
	}

	// Handle multi-input cases by blending single-input shuffles.
	if (NumV2Elements > 0)
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
	Mask, DAG);

	// The fallback path for single-input shuffles widens this into two v8i16
	// vectors with unpacks, shuffles those, and then pulls them back together
	// with a pack.
	SDValue V = V1;

	// Split the 16-entry byte mask into the masks producing result bytes 0-7
	// and 8-15; each is later used as a v8i16 shuffle mask.
	std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0)
	(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

	SDValue VLoHalf, VHiHalf;
	// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
	// them out and avoid using UNPCK{L,H} to extract the elements of V as
	// i16s.
	if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
	none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
	// Use a mask to drop the high bytes.
	VLoHalf = DAG.getBitcast(MVT::v8i16, V);
	VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
	DAG.getConstant(0x00FF, DL, MVT::v8i16));

	// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
	VHiHalf = DAG.getUNDEF(MVT::v8i16);

	// Squash the masks to point directly into VLoHalf.
	for (int &M : LoBlendMask)
	if (M >= 0)
	M /= 2;
	for (int &M : HiBlendMask)
	if (M >= 0)
	M /= 2;
	} else {
	// Otherwise just unpack the low half of V into VLoHalf and the high half into
	// VHiHalf so that we can blend them as i16s.
	SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

	VLoHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
	VHiHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
	}

	// Shuffle the widened words into place for each result half, then pack the
	// two v8i16 results into the final v16i8.
	SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
	SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

	return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
	}

	/// \brief Entry point for lowering 128-bit x86 vector shuffles.
	///
	/// Selects the element-type-specific lowering routine from \p VT and
	/// forwards all operands to it unchanged. Handles v2i64, v2f64, v4i32,
	/// v4f32, v8i16 and v16i8; any other type is unreachable.
	static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // Two 64-bit elements.
	  if (VT == MVT::v2i64)
	    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (VT == MVT::v2f64)
	    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // Four 32-bit elements.
	  if (VT == MVT::v4i32)
	    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (VT == MVT::v4f32)
	    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // Narrow integer elements.
	  if (VT == MVT::v8i16)
	    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (VT == MVT::v16i8)
	    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// \brief Lower a wide shuffle as two independent half-width shuffles.
	///
	/// Each input is cut into a low and a high half, every half of the result is
	/// assembled from those four pieces, and the two results are joined with
	/// CONCAT_VECTORS. This should work effectively with all AVX vector shuffle
	/// types.
	///
	/// \param VT type of both inputs and of the result; at least 256 bits wide.
	/// \param Mask shuffle mask over the full-width inputs.
	static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	                                          SDValue V2, ArrayRef<int> Mask,
	                                          SelectionDAG &DAG) {
	  assert(VT.getSizeInBits() >= 256 &&
	         "Only for 256-bit or wider vector shuffles!");
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

	  int NumElements = VT.getVectorNumElements();
	  int SplitNumElements = NumElements / 2;
	  MVT SplitVT = MVT::getVectorVT(VT.getVectorElementType(), NumElements / 2);

	  // Produces the (low, high) halves of V, both bitcast to SplitVT. A
	  // build-vector is re-emitted as two narrower build-vectors instead of
	  // being extracted from; this helps shuffling with splats and zeros.
	  auto SplitVector = [&](SDValue V) {
	    V = peekThroughBitcasts(V);

	    // After looking through bitcasts the element type may differ from VT's,
	    // so the halves are formed in V's own type first.
	    MVT SrcVT = V.getSimpleValueType();
	    int SrcNumElements = SrcVT.getVectorNumElements();
	    int SrcHalfElements = SrcNumElements / 2;
	    MVT SrcHalfVT =
	        MVT::getVectorVT(SrcVT.getVectorElementType(), SrcNumElements / 2);

	    SDValue LoV, HiV;
	    if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
	      SmallVector<SDValue, 16> LoOps, HiOps;
	      for (int Elt = 0; Elt < SrcHalfElements; ++Elt) {
	        LoOps.push_back(BV->getOperand(Elt));
	        HiOps.push_back(BV->getOperand(Elt + SrcHalfElements));
	      }
	      LoV = DAG.getBuildVector(SrcHalfVT, DL, LoOps);
	      HiV = DAG.getBuildVector(SrcHalfVT, DL, HiOps);
	    } else {
	      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SrcHalfVT, V,
	                        DAG.getIntPtrConstant(0, DL));
	      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SrcHalfVT, V,
	                        DAG.getIntPtrConstant(SrcHalfElements, DL));
	    }
	    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
	                          DAG.getBitcast(SplitVT, HiV));
	  };

	  SDValue LoV1, HiV1, LoV2, HiV2;
	  std::tie(LoV1, HiV1) = SplitVector(V1);
	  std::tie(LoV2, HiV2) = SplitVector(V2);

	  // Builds one half of the result from the four input halves.
	  auto HalfBlend = [&](ArrayRef<int> HalfMask) -> SDValue {
	    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;

	    // V1BlendMask / V2BlendMask select within (LoVx, HiVx); BlendMask then
	    // picks, per lane, the V1-side result (index i) or the V2-side result
	    // (index SplitNumElements + i).
	    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
	    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
	    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
	    for (int Lane = 0; Lane < SplitNumElements; ++Lane) {
	      int M = HalfMask[Lane];
	      if (M < 0)
	        continue;

	      if (M < NumElements) {
	        // Element of V1.
	        (M >= SplitNumElements ? UseHiV1 : UseLoV1) = true;
	        V1BlendMask[Lane] = M;
	        BlendMask[Lane] = Lane;
	      } else {
	        // Element of V2.
	        (M >= NumElements + SplitNumElements ? UseHiV2 : UseLoV2) = true;
	        V2BlendMask[Lane] = M - NumElements;
	        BlendMask[Lane] = SplitNumElements + Lane;
	      }
	    }

	    // Because the lowering happens after all combining takes place, we need
	    // to manually combine these blend masks as much as possible so that we
	    // create a minimal number of high-level vector shuffle nodes.
	    bool V1Unused = !UseLoV1 && !UseHiV1;
	    bool V2Unused = !UseLoV2 && !UseHiV2;

	    // Nothing referenced at all, or only one of the inputs referenced.
	    if (V1Unused && V2Unused)
	      return DAG.getUNDEF(SplitVT);
	    if (V2Unused)
	      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	    if (V1Unused)
	      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

	    SDValue V1Blend, V2Blend;
	    if (UseLoV1 && UseHiV1) {
	      V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	    } else {
	      // Only one half of V1 is needed: use it directly and fold its element
	      // selection into the final blend mask.
	      V1Blend = UseLoV1 ? LoV1 : HiV1;
	      int V1Bias = UseLoV1 ? 0 : SplitNumElements;
	      for (int Lane = 0; Lane < SplitNumElements; ++Lane)
	        if (BlendMask[Lane] >= 0 && BlendMask[Lane] < SplitNumElements)
	          BlendMask[Lane] = V1BlendMask[Lane] - V1Bias;
	    }
	    if (UseLoV2 && UseHiV2) {
	      V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
	    } else {
	      // Only one half of V2 is needed: use it directly and fold its element
	      // selection into the final blend mask.
	      V2Blend = UseLoV2 ? LoV2 : HiV2;
	      int V2Bias = UseLoV2 ? SplitNumElements : 0;
	      for (int Lane = 0; Lane < SplitNumElements; ++Lane)
	        if (BlendMask[Lane] >= SplitNumElements)
	          BlendMask[Lane] = V2BlendMask[Lane] + V2Bias;
	    }
	    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
	  };

	  // Lower the two halves of the mask and glue the results back together.
	  SDValue Lo = HalfBlend(Mask.slice(0, Mask.size() / 2));
	  SDValue Hi = HalfBlend(Mask.slice(Mask.size() / 2));
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	/// \brief Choose between splitting a shuffle in half and decomposing it into
	/// single-input shuffles plus a blend.
	///
	/// This is provided as a good fallback for many lowerings of non-single-input
	/// shuffles with more than one 128-bit lane. The split is chosen only when
	/// each input contributes elements from at most one of its 128-bit lanes and
	/// the mask is not a pair of broadcasts; every other mask is decomposed.
	static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
	                                                SDValue V1, SDValue V2,
	                                                ArrayRef<int> Mask,
	                                                SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
	                          "shuffles as it could then recurse on itself.");
	  int Size = Mask.size();

	  // True when all references to V1 name one element and all references to V2
	  // name one element, i.e. the shuffle is a blend of two broadcasts. That
	  // form is preferred because broadcasts can often fold with memory operands.
	  auto DoBothBroadcast = [&]() -> bool {
	    int BroadcastIdx[2] = {-1, -1};
	    for (int M : Mask) {
	      if (M < 0)
	        continue;
	      int Input = M >= Size ? 1 : 0;
	      int Elt = M - Input * Size;
	      if (BroadcastIdx[Input] < 0)
	        BroadcastIdx[Input] = Elt;
	      else if (BroadcastIdx[Input] != Elt)
	        return false;
	    }
	    return true;
	  };

	  if (!DoBothBroadcast()) {
	    // Record which 128-bit lanes of each input are referenced. If at most
	    // one lane of each is used, splitting decomposes to unusually few
	    // instructions.
	    int LaneCount = VT.getSizeInBits() / 128;
	    int LaneSize = Size / LaneCount;
	    SmallBitVector LaneInputs[2];
	    LaneInputs[0].resize(LaneCount, false);
	    LaneInputs[1].resize(LaneCount, false);
	    for (int M : Mask) {
	      if (M < 0)
	        continue;
	      int Input = M / Size;
	      int Lane = (M % Size) / LaneSize;
	      LaneInputs[Input][Lane] = true;
	    }
	    if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  // Either a blend of two broadcasts or the general case: decompose into
	  // single-input shuffles and a blend. This requires that the decomposed
	  // single-input shuffles don't end up here.
	  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
	/// a permutation and blend of those lanes.
	///
	/// This essentially blends the out-of-lane inputs to each lane into the lane
	/// from a permuted copy of the vector. This lowering strategy results in four
	/// instructions in the worst case for a single-input cross lane shuffle which
	/// is lower than any other fully general cross-lane shuffle strategy I'm aware
	/// of. Special cases for each particular shuffle pattern should be handled
	/// prior to trying this lowering.
	///
	/// \param VT 256-bit vector type of the inputs and the result (asserted).
	/// \param V1 first input; the permute-and-blend part reads only this one.
	/// \param V2 second input; it must be undef unless the shuffle is handed
	/// off to splitAndLowerVectorShuffle.
	/// \param Mask shuffle mask; negative entries are undef lanes.
	/// \param Subtarget selects which "splitting is cheaper" test is applied.
	/// \returns the lowered shuffle.
	static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
	                                                       SDValue V1, SDValue V2,
	                                                       ArrayRef<int> Mask,
	                                                       SelectionDAG &DAG,
	                                                       const X86Subtarget &Subtarget) {
	  // FIXME: This should probably be generalized for 512-bit vectors as well.
	  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
	  int Size = Mask.size();
	  int LaneSize = Size / 2;

	  // 128-bit lane (0 or 1) read by a defined mask entry. Entries >= Size refer
	  // to V2 and are folded onto the same lane numbering, so the two-entry flag
	  // arrays below are never indexed out of bounds. The V2.isUndef() check only
	  // comes after the split decisions, so such entries can reach this point.
	  auto SourceLane = [&](int M) { return (M % Size) / LaneSize; };

	  // If there are only inputs from one 128-bit lane, splitting will in fact be
	  // less expensive. The flags track whether the given lane contains an element
	  // that crosses to another lane.
	  if (!Subtarget.hasAVX2()) {
	    bool LaneCrossing[2] = {false, false};
	    for (int i = 0; i < Size; ++i)
	      if (Mask[i] >= 0 && SourceLane(Mask[i]) != i / LaneSize)
	        LaneCrossing[SourceLane(Mask[i])] = true;
	    // Split unless elements cross out of both lanes.
	    if (!(LaneCrossing[0] && LaneCrossing[1]))
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  } else {
	    // With AVX2 the test is weaker: split only when a single source lane is
	    // referenced at all.
	    bool LaneUsed[2] = {false, false};
	    for (int i = 0; i < Size; ++i)
	      if (Mask[i] >= 0)
	        LaneUsed[SourceLane(Mask[i])] = true;
	    if (!(LaneUsed[0] && LaneUsed[1]))
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  assert(V2.isUndef() &&
	         "This last part of this routine only works on single input shuffles");

	  // Build a two-input mask over (V1, Flipped): in-lane elements are taken
	  // from V1 unchanged, lane-crossing elements are taken from the lane-swapped
	  // copy, where they sit at the same in-lane offset in the destination lane.
	  SmallVector<int, 32> FlippedBlendMask(Size);
	  for (int i = 0; i < Size; ++i)
	    FlippedBlendMask[i] =
	        Mask[i] < 0 ? -1 : ((SourceLane(Mask[i]) == i / LaneSize)
	                                ? Mask[i]
	                                : Mask[i] % LaneSize +
	                                      (i / LaneSize) * LaneSize + Size);

	  // Flip the vector, and blend the results which should now be in-lane.
	  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
	  SDValue Flipped = DAG.getBitcast(PVT, V1);
	  Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
	                                 { 2, 3, 0, 1 });
	  Flipped = DAG.getBitcast(VT, Flipped);
	  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
	}

	/// \brief Handle lowering 2-lane 128-bit shuffles.
	///
	/// Tries, in order: a blend, a 128-bit subvector insertion (extract plus
	/// CONCAT_VECTORS), X86ISD::SHUF128 when VLX is available, and finally an
	/// X86ISD::VPERM2X128 node with an immediate control byte.
	///
	/// \param Mask shuffle mask; it must be widenable by
	/// canWidenShuffleElements into \p WidenedMask, whose entries 0 and 1 are
	/// used as the per-128-bit-half selectors.
	/// \param Zeroable bits 0-1 set means the low half of the result is zeroable,
	/// bits 2-3 set means the high half is (presumably one bit per mask element
	/// of a four-element mask -- confirm against the caller).
	/// \returns the lowered node, or a null SDValue when the mask cannot be
	/// widened or when this is a unary shuffle on an AVX2 target.
	static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
	if (Subtarget.hasAVX2() && V2.isUndef())
	return SDValue();

	SmallVector<int, 4> WidenedMask;
	if (!canWidenShuffleElements(Mask, WidenedMask))
	return SDValue();

	// TODO: If minimizing size and one of the inputs is a zero vector and
	// the zero vector has only one use, we could use a VPERM2X128 to save the
	// instruction bytes needed to explicitly generate the zero vector.

	// Blends are faster and handle all the non-lane-crossing cases.
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	bool IsLowZero = (Zeroable & 0x3) == 0x3;
	bool IsHighZero = (Zeroable & 0xc) == 0xc;

	// If either input operand is a zero vector, use VPERM2X128 because its mask
	// allows us to replace the zero input with an implicit zero.
	if (!IsLowZero && !IsHighZero) {
	// Check for patterns which can be matched with a single insert of a 128-bit
	// subvector.
	bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
	if (OnlyUsesV1 \|\| isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {

	// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
	// this will likely become vinsertf128 which can't fold a 256-bit memop.
	if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
	VT.getVectorNumElements() / 2);
	// Both matched patterns take the low 128 bits of V1 as the low half; the
	// high half is the low 128 bits of V1 again or of V2.
	SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	DAG.getIntPtrConstant(0, DL));
	SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	OnlyUsesV1 ? V1 : V2,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
	}
	}

	// Try to use SHUF128 if possible.
	if (Subtarget.hasVLX()) {
	// Only used when the low half comes from V1 and the high half from V2.
	if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
	unsigned PermMask = ((WidenedMask[0] % 2) << 0) \|
	((WidenedMask[1] % 2) << 1);
	return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
	DAG.getConstant(PermMask, DL, MVT::i8));
	}
	}
	}

	// Otherwise form a 128-bit permutation. After accounting for undefs,
	// convert the 64-bit shuffle mask selection values into 128-bit
	// selection bits by dividing the indexes by 2 and shifting into positions
	// defined by a vperm2*128 instruction's immediate control byte.

	// The immediate permute control byte looks like this:
	// [1:0] - select 128 bits from sources for low half of destination
	// [2] - ignore
	// [3] - zero low half of destination
	// [5:4] - select 128 bits from sources for high half of destination
	// [6] - ignore
	// [7] - zero high half of destination

	assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");

	unsigned PermMask = 0;
	PermMask \|= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
	PermMask \|= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

	// Check the immediate mask and replace unused sources with undef.
	// In each nibble, bit 1 chooses V2 over V1 and bit 3 zeroes that half. V1
	// is unused when neither nibble has both bits clear; V2 is unused when
	// neither nibble has bit 1 set with bit 3 clear.
	if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
	V1 = DAG.getUNDEF(VT);
	if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
	V2 = DAG.getUNDEF(VT);

	return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
	DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// \brief Lower a two-input shuffle as a whole-lane shuffle followed by one
	/// in-lane shuffle.
	///
	/// This applies only when every 128-bit lane of the result reads from a
	/// single 128-bit source lane (of either input) and all result lanes use the
	/// same in-lane element pattern. The source lanes are first moved into place
	/// with a shuffle of 64-bit elements, then a single-input shuffle that does
	/// not cross lanes arranges the elements inside each lane.
	///
	/// FIXME: It might be worthwhile at some point to support this without
	/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
	/// in x86 only floating point has interesting non-repeating shuffles, and even
	/// those are still marginally more expensive.
	///
	/// \returns the lowered shuffle, or a null SDValue when the mask does not
	/// have the required shape.
	static SDValue lowerVectorShuffleByMerging128BitLanes(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

	  int Size = Mask.size();
	  int LaneSize = 128 / VT.getScalarSizeInBits();
	  int NumLanes = Size / LaneSize;
	  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

	  // Lanes[d] is the source lane feeding destination lane d, counted across
	  // both inputs; InLaneMask[k] is the in-lane source offset shared by
	  // position k of every lane. Entries stay -1 until first seen.
	  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
	  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
	  for (int Idx = 0; Idx < Size; ++Idx) {
	    int M = Mask[Idx];
	    if (M < 0)
	      continue;

	    // Each destination lane must draw from exactly one source lane.
	    int &SrcLane = Lanes[Idx / LaneSize];
	    if (SrcLane < 0)
	      SrcLane = M / LaneSize;
	    if (SrcLane != M / LaneSize)
	      return SDValue();

	    // Each in-lane position must use the same offset in every lane.
	    int &SrcOffset = InLaneMask[Idx % LaneSize];
	    if (SrcOffset < 0)
	      SrcOffset = M % LaneSize;
	    if (SrcOffset != M % LaneSize)
	      return SDValue();
	  }

	  // Move whole lanes into place, expressed as a shuffle of 64-bit elements
	  // (two per 128-bit lane).
	  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
	                                VT.getSizeInBits() / 64);
	  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
	  for (int Lane = 0; Lane < NumLanes; ++Lane) {
	    if (Lanes[Lane] < 0)
	      continue;
	    LaneMask[2 * Lane + 0] = 2 * Lanes[Lane] + 0;
	    LaneMask[2 * Lane + 1] = 2 * Lanes[Lane] + 1;
	  }

	  SDValue WideV1 = DAG.getBitcast(LaneVT, V1);
	  SDValue WideV2 = DAG.getBitcast(LaneVT, V2);
	  SDValue LaneShuffle =
	      DAG.getVectorShuffle(LaneVT, DL, WideV1, WideV2, LaneMask);

	  // Return to the element type the caller asked for.
	  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

	  // Every needed element now lives in its destination lane, so only the
	  // in-lane offset of the original index matters.
	  SmallVector<int, 8> NewMask((unsigned)Size, -1);
	  for (int Idx = 0; Idx < Size; ++Idx) {
	    if (Mask[Idx] < 0)
	      continue;
	    int LaneBase = (Idx / LaneSize) * LaneSize;
	    NewMask[Idx] = LaneBase + Mask[Idx] % LaneSize;
	  }
	  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
	         "Must not introduce lane crosses at this point!");

	  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
	}

	/// Lower 256-bit or 512-bit shuffles whose result has an entirely undef lower
	/// or upper half.
	///
	/// Three shapes are handled, in this order:
	///  - the upper half of \p V1 moved to the lower half of the result,
	///  - the lower half of \p V1 moved to the upper half of the result,
	///  - a half-width shuffle of at most two of the four input halves
	///    (lower/upper of \p V1, lower/upper of \p V2), inserted into the defined
	///    half of the result.
	///
	/// \returns the lowered value, or an empty SDValue when neither half of the
	/// mask is undef, when more than two input halves are referenced, or when
	/// one of the heuristics below rejects the extract-and-shuffle form.
	static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
	                                               SDValue V1, SDValue V2,
	                                               ArrayRef<int> Mask,
	                                               const X86Subtarget &Subtarget,
	                                               SelectionDAG &DAG) {
	  assert((VT.is256BitVector() || VT.is512BitVector()) &&
	         "Expected 256-bit or 512-bit vector");

	  unsigned HalfNumElts = VT.getVectorNumElements() / 2;
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

	  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
	  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
	  if (!(UndefLower || UndefUpper))
	    return SDValue();

	  // <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>:
	  // the upper half of V1 lands in the lower half, upper half stays undef.
	  if (UndefUpper &&
	      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
	    SDValue Upper = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
	                                DAG.getIntPtrConstant(HalfNumElts, DL));
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Upper,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  // <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>:
	  // the lower half of V1 lands in the upper half, lower half stays undef.
	  if (UndefLower &&
	      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
	    SDValue Lower = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
	                                DAG.getIntPtrConstant(0, DL));
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Lower,
	                       DAG.getIntPtrConstant(HalfNumElts, DL));
	  }

	  // General case, e.g. <X, X, X, X, u, u, u, u> or <X, X, u, u>: build a
	  // half-width mask over at most two of the four input halves.
	  // Half indices: 0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2.
	  int HalfIdx1 = -1, HalfIdx2 = -1;
	  SmallVector<int, 8> HalfMask(HalfNumElts);
	  unsigned Offset = UndefLower ? HalfNumElts : 0;
	  for (unsigned i = 0; i != HalfNumElts; ++i) {
	    int M = Mask[i + Offset];
	    if (M < 0) {
	      // Sentinel values are carried over unchanged.
	      HalfMask[i] = M;
	      continue;
	    }

	    int SrcHalf = M / HalfNumElts; // Which of the four halves.
	    int SrcElt = M % HalfNumElts;  // Position inside that half.

	    if (HalfIdx1 < 0 || HalfIdx1 == SrcHalf) {
	      // First operand of the half-width shuffle.
	      HalfIdx1 = SrcHalf;
	      HalfMask[i] = SrcElt;
	    } else if (HalfIdx2 < 0 || HalfIdx2 == SrcHalf) {
	      // Second operand of the half-width shuffle.
	      HalfIdx2 = SrcHalf;
	      HalfMask[i] = SrcElt + HalfNumElts;
	    } else {
	      // A third distinct half would be needed.
	      return SDValue();
	    }
	  }
	  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

	  // Count how many of the chosen halves are lower / upper halves.
	  auto IsLowerHalf = [](int Idx) { return Idx == 0 || Idx == 2; };
	  auto IsUpperHalf = [](int Idx) { return Idx == 1 || Idx == 3; };
	  int NumLowerHalves = IsLowerHalf(HalfIdx1) + IsLowerHalf(HalfIdx2);
	  int NumUpperHalves = IsUpperHalf(HalfIdx1) + IsUpperHalf(HalfIdx2);

	  // uuuuXXXX - don't extract uppers just to insert again.
	  if (UndefLower && NumUpperHalves != 0)
	    return SDValue();

	  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
	  if (UndefUpper && NumUpperHalves == 2)
	    return SDValue();

	  // XXXXuuuu built only from lower halves is always lowered here; every
	  // other shape is subject to the target-specific opt-outs below.
	  const bool LowersIntoLower = UndefUpper && NumUpperHalves == 0;
	  if (!LowersIntoLower) {
	    if (Subtarget.hasAVX2()) {
	      // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
	      if (VT == MVT::v4f64 || VT == MVT::v4i64)
	        return SDValue();
	      // AVX2 supports variable 32-bit element cross-lane shuffles, so don't
	      // extract a lower and an upper half for XXXXuuuu.
	      if ((VT == MVT::v8f32 || VT == MVT::v8i32) && UndefUpper &&
	          NumLowerHalves != 0 && NumUpperHalves != 0)
	        return SDValue();
	    }

	    // AVX512 - only the all-lowers XXXXuuuu shape is lowered here.
	    if (VT.is512BitVector())
	      return SDValue();
	  }

	  // Materialize one of the four input halves (or undef for index -1).
	  auto ExtractHalf = [&](int HalfIdx) -> SDValue {
	    if (HalfIdx < 0)
	      return DAG.getUNDEF(HalfVT);
	    SDValue Src = (HalfIdx < 2 ? V1 : V2);
	    HalfIdx = (HalfIdx % 2) * HalfNumElts;
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Src,
	                       DAG.getIntPtrConstant(HalfIdx, DL));
	  };

	  SDValue Half1 = ExtractHalf(HalfIdx1);
	  SDValue Half2 = ExtractHalf(HalfIdx2);
	  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
	                     DAG.getIntPtrConstant(Offset, DL));
	}

	/// \brief Check whether every element taken from one shuffle input already
	/// sits in its destination slot.
	///
	/// \param Input which operand to test: 0 for the first, 1 for the second.
	/// \param Mask  the shuffle mask; negative entries are ignored.
	///
	/// \returns true if no mask entry that selects from \p Input asks for an
	/// element at a position other than its own slot, i.e. that input needs no
	/// permutation and can simply be blended in.
	static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
	  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
	  const int NumElts = Mask.size();
	  for (int Slot = 0; Slot != NumElts; ++Slot) {
	    const int M = Mask[Slot];
	    // Skip undef/sentinel entries.
	    if (M < 0)
	      continue;
	    // Skip entries that come from the other input.
	    if (M / NumElts != Input)
	      continue;
	    // An element of this input that would have to move.
	    if (M % NumElts != Slot)
	      return false;
	  }

	  return true;
	}

	/// Handle case where shuffle sources are coming from the same 128-bit lane and
	/// every lane can be represented as the same repeating mask - allowing us to
	/// shuffle the sources with the repeating shuffle and then permute the result
	/// to the destination lanes.
	///
	/// Two strategies are tried in order:
	///  1. With AVX2: if the mask repeats every 16, 32 or 64 bits and only
	///     references the lowest 128-bit lane of the inputs, shuffle those
	///     elements into the lowest positions and then splat that chunk.
	///  2. Otherwise: if each destination (sub-)lane reads from a single source
	///     lane and the lane-local patterns can be merged into one repeating
	///     mask, emit the repeating shuffle followed by a (sub-)lane permute.
	///
	/// Returns an empty SDValue if neither strategy applies.
	static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	int NumElts = VT.getVectorNumElements();
	int NumLanes = VT.getSizeInBits() / 128;
	int NumLaneElts = NumElts / NumLanes;

	// On AVX2 we may be able to just shuffle the lowest elements and then
	// broadcast the result.
	if (Subtarget.hasAVX2()) {
	for (unsigned BroadcastSize : {16, 32, 64}) {
	// Only broadcast chunks wider than one element are interesting here.
	if (BroadcastSize <= VT.getScalarSizeInBits())
	continue;
	int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

	// Attempt to match a repeating pattern every NumBroadcastElts,
	// accounting for UNDEFs but only references the lowest 128-bit
	// lane of the inputs.
	// On success the first NumBroadcastElts entries of RepeatMask hold the
	// merged pattern; the remaining entries are left untouched.
	auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j) {
	int M = Mask[i + j];
	if (M < 0)
	continue;
	int &R = RepeatMask[j];
	// Reject sources outside lane 0 of whichever input M selects.
	if (0 != ((M % NumElts) / NumLaneElts))
	return false;
	// Reject a conflict with what an earlier chunk asked for at position j.
	if (0 <= R && R != M)
	return false;
	R = M;
	}
	return true;
	};

	SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
	if (!FindRepeatingBroadcastMask(RepeatMask))
	continue;

	// Shuffle the (lowest) repeated elements in place for broadcast.
	SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

	// Shuffle the actual broadcast.
	// Every chunk of NumBroadcastElts elements reads elements 0..N-1 of
	// RepeatShuf.
	SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j)
	BroadcastMask[i + j] = j;
	return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
	BroadcastMask);
	}
	}

	// Bail if the shuffle mask doesn't cross 128-bit lanes.
	if (!is128BitLaneCrossingShuffleMask(VT, Mask))
	return SDValue();

	// Bail if we already have a repeated lane shuffle mask.
	SmallVector<int, 8> RepeatedShuffleMask;
	if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
	return SDValue();

	// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
	// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
	int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
	int NumSubLanes = NumLanes * SubLaneScale;
	int NumSubLaneElts = NumLaneElts / SubLaneScale;

	// Check that all the sources are coming from the same lane and see if we can
	// form a repeating shuffle mask (local to each sub-lane). At the same time,
	// determine the source sub-lane for each destination sub-lane.
	// RepeatedSubLaneMasks holds one candidate pattern per sub-lane position
	// within a 128-bit lane; only the first SubLaneScale entries are used.
	int TopSrcSubLane = -1;
	SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
	SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

	for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
	// Extract the sub-lane mask, check that it all comes from the same lane
	// and normalize the mask entries to come from the first lane.
	int SrcLane = -1;
	SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
	if (M < 0)
	continue;
	// Lane index within the selected input (V1 and V2 lanes are not
	// distinguished here).
	int Lane = (M % NumElts) / NumLaneElts;
	if ((0 <= SrcLane) && (SrcLane != Lane))
	return SDValue();
	SrcLane = Lane;
	// Lane-relative index; the NumElts offset keeps entries that select V2
	// distinct from entries that select V1.
	int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
	SubLaneMask[Elt] = LocalM;
	}

	// Whole sub-lane is UNDEF.
	if (SrcLane < 0)
	continue;

	// Attempt to match against the candidate repeated sub-lane masks.
	for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
	// Two masks are compatible if they agree wherever both are defined.
	auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
	for (int i = 0; i != NumSubLaneElts; ++i) {
	if (M1[i] < 0 \|\| M2[i] < 0)
	continue;
	if (M1[i] != M2[i])
	return false;
	}
	return true;
	};

	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
	if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
	continue;

	// Merge the sub-lane mask into the matching repeated sub-lane mask.
	for (int i = 0; i != NumSubLaneElts; ++i) {
	int M = SubLaneMask[i];
	if (M < 0)
	continue;
	assert((RepeatedSubLaneMask[i] < 0 \|\| RepeatedSubLaneMask[i] == M) &&
	"Unexpected mask element");
	RepeatedSubLaneMask[i] = M;
	}

	// Track the top most source sub-lane - by setting the remaining to UNDEF
	// we can greatly simplify shuffle matching.
	int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
	TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
	Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
	break;
	}

	// Bail if we failed to find a matching repeated sub-lane mask.
	if (Dst2SrcSubLanes[DstSubLane] < 0)
	return SDValue();
	}
	assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
	"Unexpected source lane");

	// Create a repeating shuffle mask for the entire vector.
	// Only sub-lanes up to TopSrcSubLane are populated; the rest stay undef.
	SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
	for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
	int Lane = SubLane / SubLaneScale;
	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = RepeatedSubLaneMask[Elt];
	if (M < 0)
	continue;
	int Idx = (SubLane * NumSubLaneElts) + Elt;
	// Rebase the lane-0-relative index onto this sub-lane's own lane.
	RepeatedMask[Idx] = M + (Lane * NumLaneElts);
	}
	}
	SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

	// Shuffle each source sub-lane to its destination.
	SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumSubLaneElts) {
	int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
	if (SrcSubLane < 0)
	continue;
	for (int j = 0; j != NumSubLaneElts; ++j)
	SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
	}

	return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
	SubLaneMask);
	}

	/// Test whether \p Mask can be implemented by a single SHUFPD-style shuffle
	/// of 64-bit elements.
	///
	/// Result slot i may only take one of the two elements of the 128-bit pair
	/// containing i: from the first operand for even i and from the second
	/// operand for odd i. The operand-swapped form is accepted as well, in which
	/// case \p V1 and \p V2 are exchanged.
	///
	/// \param [in,out] V1, V2   shuffle operands; swapped if only the commuted
	///                          pattern matches.
	/// \param [out] ShuffleImm  bit i is the low bit of Mask[i] for every defined
	///                          entry. It is written even when the function
	///                          returns false.
	///
	/// \returns true if the (possibly commuted) mask matches; false otherwise,
	/// including when the mask holds a negative entry other than
	/// SM_SentinelUndef.
	static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
	                                         unsigned &ShuffleImm,
	                                         ArrayRef<int> Mask) {
	  int NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarSizeInBits() == 64 &&
	         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
	         "Unexpected data type for VSHUFPD");

	  // Accepted sources per slot:
	  //   V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
	  //   V4F64: 0/1, 4/5, 2/3, 6/7..
	  ShuffleImm = 0;
	  bool MatchesDirect = true;
	  bool MatchesSwapped = true;
	  for (int Slot = 0; Slot != NumElts; ++Slot) {
	    const int M = Mask[Slot];
	    if (M == SM_SentinelUndef)
	      continue;
	    // Any other sentinel cannot be expressed.
	    if (M < 0)
	      return false;

	    const int Parity = Slot & 1;
	    const int DirectBase = (Slot & 6) + NumElts * Parity;
	    const int SwappedBase = (Slot & 0xe) + NumElts * (Parity ^ 1);
	    MatchesDirect &= (DirectBase <= M && M <= DirectBase + 1);
	    MatchesSwapped &= (SwappedBase <= M && M <= SwappedBase + 1);

	    // Low/high selection within the pair.
	    ShuffleImm |= (M % 2) << Slot;
	  }

	  if (!MatchesDirect && !MatchesSwapped)
	    return false;

	  // Only the commuted pattern fits: exchange the operands.
	  if (!MatchesDirect)
	    std::swap(V1, V2);
	  return true;
	}

	/// Emit an X86ISD::SHUFP node for \p Mask if it matches the SHUFPD pattern
	/// (directly or with the operands commuted).
	///
	/// \returns the SHUFP node with an i8 immediate, or an empty SDValue if
	/// matchVectorShuffleWithSHUFPD rejects the mask.
	static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
	                                            ArrayRef<int> Mask, SDValue V1,
	                                            SDValue V2, SelectionDAG &DAG) {
	  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
	         "Unexpected data type for VSHUFPD");

	  // The matcher may swap V1/V2 (they are local copies here) and fills in the
	  // immediate.
	  unsigned Imm8 = 0;
	  const bool Matched = matchVectorShuffleWithSHUFPD(VT, V1, V2, Imm8, Mask);
	  if (Matched) {
	    SDValue ImmOp = DAG.getConstant(Imm8, DL, MVT::i8);
	    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, ImmOp);
	  }

	  return SDValue();
	}

	/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the checks is significant. The
	/// single-input (V2 undef) path always returns; the two-input path ends in a
	/// generic fallback.
	static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	// First try the dedicated two-128-bit-lane lowering.
	if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Use low duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

	if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
	// Non-half-crossing single input shuffles can be lowered with an
	// interleaved permutation.
	// Bit i of the immediate is set when result element i takes the high
	// element of its own 128-bit lane (index 1 in lane 0, index 3 in lane 1).
	unsigned VPERMILPMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1) \|
	((Mask[2] == 3) << 2) \| ((Mask[3] == 3) << 3);
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
	DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	}

	// With AVX2 we have direct support for this permutation.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Otherwise, fall back.
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
	DAG, Subtarget);
	}

	// From here on V2 is known to be defined (two-input shuffle).

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check if the blend happens to exactly fit that of SHUFPD.
	if (SDValue Op =
	lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return Op;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, if we have AVX2 and either input is already in place,
	// we will be able to shuffle the other input, even across lanes, in a single
	// instruction, so skip this pattern.
	if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) \|\|
	isShuffleMaskInputInPlace(1, Mask))))
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return Result;
	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// If we have AVX2 then we always want to lower with a blend because at v4 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
	Mask, DAG);

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
	}

	/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v4i64 shuffling.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the checks is significant. The
	/// single-input (V2 undef) path always returns; the two-input path ends in
	/// the decomposed shuffle + blend fallback.
	static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

	// First try the dedicated two-128-bit-lane lowering.
	if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	if (V2.isUndef()) {
	// When the shuffle is mirrored between the 128-bit lanes of the unit, we
	// can use lower latency instructions that will operate on both lanes.
	SmallVector<int, 2> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
	// Rescale the repeated 64-bit element mask by 2 so it can drive a PSHUFD
	// on the v8i32 bitcast of V1.
	SmallVector<int, 4> PSHUFDMask;
	scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
	return DAG.getBitcast(
	MVT::v4i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
	DAG.getBitcast(MVT::v8i32, V1),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}

	// AVX2 provides a direct instruction for permuting a single input across
	// lanes.
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// From here on V2 is known to be defined (two-input shuffle).

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or VEXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;
	}

	// Try to use PALIGNR.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, since AVX2 is available here (see the assert above), if
	// either input is already in place we will be able to shuffle the other
	// input, even across lanes, in a single instruction, so skip this pattern.
	if (!isShuffleMaskInputInPlace(0, Mask) &&
	!isShuffleMaskInputInPlace(1, Mask))
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
	Mask, DAG);
	}

	/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the checks is significant. Masks
	/// that repeat in each 128-bit lane and single-input masks each have a path
	/// that always returns; everything else ends in a generic fallback.
	static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane, we have many more
	// options to efficiently lower the shuffle.
	SmallVector<int, 4> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
	assert(RepeatedMask.size() == 4 &&
	"Repeated masks must be half the mask width!");

	// Use even/odd duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
	return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

	// Single input: one immediate-controlled in-lane permute is enough.
	if (V2.isUndef())
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
	return V;

	// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
	// have already handled any direct blends.
	return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
	}

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If we have a single input shuffle with different shuffle patterns in the
	// two 128-bit lanes use the variable mask to VPERMILPS.
	if (V2.isUndef()) {
	// NOTE(review): the constant mask vector is built before we know whether
	// either of the two variable-permute paths below is taken; on the final
	// fallback path it is not referenced.
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
	return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

	// Lane-crossing single-input shuffle: use the variable permute when AVX2
	// is available. Note the operand order (mask first, then source).
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

	// Otherwise, fall back.
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
	DAG, Subtarget);
	}

	// From here on V2 is known to be defined (two-input shuffle).

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return Result;
	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code using vpunpcklwd and
	// vpunpckhwd instrs than vblend.
	if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
	if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
	Mask, DAG))
	return V;

	// If we have AVX2 then we always want to lower with a blend because at v8 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
	Mask, DAG);

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
	}

	/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v8i32 shuffling.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the checks is significant. The
	/// function always returns through one of the strategies or the final
	/// decomposed shuffle + blend fallback.
	static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code than vblend by using
	// vpunpcklwd and vpunpckhwd instrs.
	if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
	!Subtarget.hasAVX512())
	if (SDValue V =
	lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane we can use more
	// efficient instructions that mirror the shuffles across the two 128-bit
	// lanes.
	// The result of this check is kept because it is consulted again for the
	// SHUFPS path further down.
	SmallVector<int, 4> RepeatedMask;
	bool Is128BitLaneRepeatedShuffle =
	is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
	if (Is128BitLaneRepeatedShuffle) {
	assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
	if (V2.isUndef())
	return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
	return V;
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or EXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;
	}

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If the shuffle patterns aren't repeated but it is a single input, directly
	// generate a cross-lane VPERMD instruction.
	// Note the operand order of the VPERMV node (mask first, then source).
	if (V2.isUndef()) {
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
	}

	// Assume that a single SHUFPS is faster than an alternative sequence of
	// multiple instructions (even if the CPU has a domain penalty).
	// If some CPU is harmed by the domain switch, we can fix it in a later pass.
	if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
	// Perform the shuffle on the v8f32 bitcasts and cast the result back.
	SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
	SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
	SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
	CastV1, CastV2, DAG);
	return DAG.getBitcast(MVT::v8i32, ShufPS);
	}

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
	Mask, DAG);
	}

	/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v16i16 shuffling.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the checks is significant. The
	/// function always returns through one of the strategies or the final
	/// split-or-blend fallback.
	static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
	return V;

	if (V2.isUndef()) {
	// There are no generalized cross-lane shuffle operations available on i16
	// element types.
	if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
	Mask, DAG, Subtarget);

	SmallVector<int, 8> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	// As this is a single-input shuffle, the repeated mask should be
	// a strictly valid v8i16 mask that we can pass through to the v8i16
	// lowering to handle even the v16 case.
	return lowerV8I16GeneralSingleInputVectorShuffle(
	DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
	}
	// A single-input, in-lane, non-repeating mask falls through to the
	// strategies below.
	}

	if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
	DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
	return PSHUFB;

	// AVX512BWVL can lower to VPERMW.
	// This path returns unconditionally when both features are present.
	if (Subtarget.hasBWI() && Subtarget.hasVLX())
	return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
	}

	/// \brief Lower a 32-lane shuffle of 8-bit integers.
	///
	/// This routine is only called when AVX2 is available (see the assert below),
	/// which gives a reasonable instruction set for v32i8 shuffling. The lowering
	/// strategies are attempted in order and the first one that yields a value is
	/// returned.
	static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

	  const MVT VT = MVT::v32i8;

	  // Lowering as a zext is strictly faster than any alternative, and in many
	  // cases it also lets memory operands be folded into the shuffle.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Extended;

	  // See whether a single element can simply be broadcast.
	  if (SDValue Splat =
	          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Splat;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  // Masks that match the dedicated unpack instructions.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Masks that match the dedicated pack instructions.
	  if (SDValue Packed =
	          lowerVectorShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
	    return Packed;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue Rotated =
	          lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Rotated;

	  // Build an in-lane repeating shuffle mask and then shuffle the results
	  // into the target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Permuted;

	  // There are no generalized cross-lane shuffle operations available on i8
	  // element types.
	  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(VT, Mask))
	    return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG,
	                                                   Subtarget);

	  if (SDValue ByteShuffled = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
	    return ByteShuffled;

	  // AVX512VBMIVL can lower to VPERMB.
	  if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	    return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

	  // Merging 128-bit lanes may simplify this enough to enable a lane-based
	  // shuffle.
	  if (SDValue Merged = lowerVectorShuffleByMerging128BitLanes(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Merged;

	  // Nothing matched: use the generic split-or-blend lowering.
	  return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Entry point for lowering the various 256-bit x86 vector shuffles.
	///
	/// Depending on the available instructions this either dispatches to the
	/// routine for the specific 256-bit vector type, or splits the shuffle into
	/// two 128-bit shuffles and fuses the results back together.
	static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // When exactly one element comes from V2 and it lands in element zero, try
	  // to insert it into V1 if that can be done cheaply.
	  const int NumElts = VT.getVectorNumElements();
	  const int NumFromV2 =
	      count_if(Mask, [NumElts](int Idx) { return Idx >= NumElts; });

	  if (NumFromV2 == 1 && Mask[0] >= NumElts) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Special cases where the lower or upper half is UNDEF.
	  if (SDValue HalfUndef =
	          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return HalfUndef;

	  // There is a hard cut-over between AVX1 and AVX2, so checking the subtarget
	  // here avoids much of the subtarget querying in the per-type routines. With
	  // AVX1 there is essentially no ability to manipulate a 256-bit vector with
	  // integer types; since floating point types will be used eventually anyway,
	  // cast to the FP domain right away and operate entirely there.
	  if (VT.isInteger() && !Subtarget.hasAVX2()) {
	    const int EltBits = VT.getScalarSizeInBits();

	    if (EltBits >= 32) {
	      MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltBits),
	                                  VT.getVectorNumElements());
	      V1 = DAG.getBitcast(FpVT, V1);
	      V2 = DAG.getBitcast(FpVT, V2);
	      SDValue FpShuffle = DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask);
	      return DAG.getBitcast(VT, FpShuffle);
	    }

	    // No floating point type of this element width exists. If the bit
	    // operations for masking/blending cannot be used, decompose into 128-bit
	    // vectors.
	    if (SDValue Masked =
	            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
	      return Masked;
	    if (SDValue BitBlended =
	            lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
	      return BitBlended;
	    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  // Dispatch on the concrete 256-bit vector type.
	  switch (VT.SimpleTy) {
	  case MVT::v4f64:
	    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v4i64:
	    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8f32:
	    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i32:
	    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i16:
	    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i8:
	    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 256-bit x86 vector type!");
	  }
	}

	/// \brief Try to lower a 512-bit vector shuffle as a shuffle of 128-bit
	/// subvectors.
	///
	/// \p VT must be a 512-bit vector with 64-bit elements (both are asserted).
	/// \p Mask is the 8-element shuffle mask over \p V1 and \p V2.
	///
	/// \returns the lowered node, or an empty SDValue when the mask cannot be
	/// widened to 128-bit granularity or when one 256-bit half of the result
	/// would need 128-bit chunks from both inputs.
	static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
	                                        ArrayRef<int> Mask, SDValue V1,
	                                        SDValue V2, SelectionDAG &DAG) {
	  assert(VT.getScalarSizeInBits() == 64 &&
	         "Unexpected element type size for 128bit shuffle.");

	  // Handling 256-bit vectors would require VLX, and most probably
	  // lowerV2X128VectorShuffle() is the better solution for them.
	  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

	  // Pair up adjacent 64-bit elements into 128-bit chunks; bail if impossible.
	  SmallVector<int, 4> WidenedMask;
	  if (!canWidenShuffleElements(Mask, WidenedMask))
	    return SDValue();

	  // Check for patterns which can be matched with a single insert of a 256-bit
	  // subvector.
	  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
	                                        {0, 1, 2, 3, 0, 1, 2, 3});
	  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
	                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
	    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	                              DAG.getIntPtrConstant(0, DL));
	    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	                              OnlyUsesV1 ? V1 : V2,
	                              DAG.getIntPtrConstant(0, DL));
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
	  }

	  assert(WidenedMask.size() == 4);

	  // See if this is an insertion of the lower 128-bits of V2 into V1.
	  bool IsInsert = true;
	  int V2Index = -1;
	  for (int i = 0; i < 4; ++i) {
	    assert(WidenedMask[i] >= -1);
	    if (WidenedMask[i] < 0)
	      continue;

	    // Make sure all V1 subvectors are in place.
	    if (WidenedMask[i] < 4) {
	      if (WidenedMask[i] != i) {
	        IsInsert = false;
	        break;
	      }
	    } else {
	      // Make sure we only have a single V2 index and it's the lowest
	      // 128-bits.
	      if (V2Index >= 0 || WidenedMask[i] != 4) {
	        IsInsert = false;
	        break;
	      }
	      V2Index = i;
	    }
	  }
	  if (IsInsert && V2Index >= 0) {
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
	                                 DAG.getIntPtrConstant(0, DL));
	    // V2Index counts 128-bit chunks; the insert position is in elements.
	    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
	  }

	  // Try to lower to vshuf64x2/vshuf32x4.
	  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
	  unsigned PermMask = 0;
	  // Ensure the two chunks of each result half come from the same operand.
	  for (int i = 0; i < 4; ++i) {
	    assert(WidenedMask[i] >= -1);
	    if (WidenedMask[i] < 0)
	      continue;

	    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
	    unsigned OpIndex = i / 2;
	    if (Ops[OpIndex].isUndef())
	      Ops[OpIndex] = Op;
	    else if (Ops[OpIndex] != Op)
	      return SDValue();

	    // Convert the 128-bit shuffle mask selection values into 128-bit selection
	    // bits defined by a vshuf64x2 instruction's immediate control byte.
	    PermMask |= (WidenedMask[i] % 4) << (i * 2);
	  }

	  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
	                     DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
	///
	/// Single-input shuffles are tried first (MOVDDUP, VPERMILPI, VPERMI); after
	/// that the two-input strategies are attempted in order, ending with a
	/// PERMV-based lowering as the fallback.
	static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  if (V2.isUndef()) {
	    // Use low duplicate instructions for masks that match their pattern.
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

	    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
	      // Non-half-crossing single input shuffles can be lowered with an
	      // interleaved permutation. Bit i of the immediate is set when result
	      // element i takes the odd-indexed element of its own 128-bit lane.
	      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
	                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
	                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
	                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
	                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	    }

	    // The same 4-element pattern in both 256-bit halves maps to VPERMI.
	    SmallVector<int, 4> RepeatedMask;
	    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
	      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
	                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
	  }

	  if (SDValue Shuf128 =
	          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Shuf128;

	  if (SDValue Unpck =
	          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Unpck;

	  // Check if the blend happens to exactly fit that of SHUFPD.
	  if (SDValue Op =
	          lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Op;

	  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
	                                             V2, DAG, Subtarget))
	    return V;

	  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
	                                                Zeroable, Subtarget, DAG))
	    return Blend;

	  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
	}

	/// \brief Lower a 16-lane shuffle of 32-bit floating point values.
	///
	/// Masks that repeat in every 128-bit lane are handled first, then
	/// single-input shuffles that stay within their lane, and finally the
	/// EXPAND and PERMV based lowerings.
	static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  const MVT VT = MVT::v16f32;

	  // A mask that repeats in each 128-bit lane opens up many more options for
	  // lowering the shuffle efficiently.
	  SmallVector<int, 4> RepeatedMask;
	  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
	    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

	    // Even/odd duplicate instructions for masks matching their pattern.
	    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
	      return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
	    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
	      return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);

	    if (V2.isUndef())
	      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	    // Dedicated unpack instructions for masks matching their pattern.
	    if (SDValue Unpacked =
	            lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Unpacked;

	    if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                    Zeroable, Subtarget, DAG))
	      return Blended;

	    // Otherwise, fall back to a SHUFPS sequence.
	    return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
	  }

	  // A single-input shuffle whose pattern differs between the 128-bit lanes
	  // but never crosses them can use the variable-mask VPERMILPS.
	  if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(VT, Mask)) {
	    SDValue PermIndices = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
	    return DAG.getNode(X86ISD::VPERMILPV, DL, VT, V1, PermIndices);
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask, V1,
	                                                    V2, DAG, Subtarget))
	    return Expanded;

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower an 8-lane shuffle of 64-bit integers.
	///
	/// Single-input shuffles with a lane-repeated mask are handled up front; the
	/// remaining strategies are then tried in order, with PERMV as the fallback.
	static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  const MVT VT = MVT::v8i64;

	  if (V2.isUndef()) {
	    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
	    // can use lower latency instructions that operate on all four 128-bit
	    // lanes.
	    SmallVector<int, 2> LaneMask128;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask128)) {
	      // Rescale the 2 x i64 lane mask to a 4 x i32 mask for PSHUFD.
	      SmallVector<int, 4> DWordMask;
	      scaleShuffleMask<int>(2, LaneMask128, DWordMask);
	      SDValue DWordShuffle =
	          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
	                      DAG.getBitcast(MVT::v16i32, V1),
	                      getV4X86ShuffleImm8ForMask(DWordMask, DL, DAG));
	      return DAG.getBitcast(MVT::v8i64, DWordShuffle);
	    }

	    SmallVector<int, 4> LaneMask256;
	    if (is256BitLaneRepeatedShuffleMask(VT, Mask, LaneMask256))
	      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(LaneMask256, DL, DAG));
	  }

	  if (SDValue SubvecShuffle = lowerV4X128VectorShuffle(DL, VT, Mask, V1, V2, DAG))
	    return SubvecShuffle;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // VALIGN.
	  if (SDValue Aligned =
	          lowerVectorShuffleAsRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Aligned;

	  // PALIGNR.
	  if (SDValue ByteRotated =
	          lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return ByteRotated;

	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask, V1,
	                                                    V2, DAG, Subtarget))
	    return Expanded;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a 16-lane shuffle of 32-bit integers.
	///
	/// Strategies are attempted in order and the first one to succeed is
	/// returned; PERMV is the final fallback.
	static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  const MVT VT = MVT::v16i32;

	  // Lowering as a zext is strictly faster than any alternative, and in many
	  // cases it also lets memory operands be folded into the shuffle.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Extended;

	  // A mask repeated in each 128-bit lane permits more efficient instructions
	  // that mirror the shuffle across the four 128-bit lanes.
	  SmallVector<int, 4> RepeatedMask;
	  const bool RepeatsPerLane =
	      is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask);
	  if (RepeatsPerLane) {
	    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
	    if (V2.isUndef())
	      return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	    // Dedicated unpack instructions for masks matching their pattern.
	    if (SDValue Unpacked =
	            lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Unpacked;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // VALIGN.
	  if (SDValue Aligned =
	          lowerVectorShuffleAsRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Aligned;

	  // Byte rotation instructions are only attempted with BWI.
	  if (Subtarget.hasBWI()) {
	    if (SDValue ByteRotated = lowerVectorShuffleAsByteRotate(
	            DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return ByteRotated;
	  }

	  // Assume that a single SHUFPS is faster than using a permv shuffle.
	  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
	  if (RepeatsPerLane && isSingleSHUFPSMask(RepeatedMask)) {
	    SDValue FpV1 = DAG.getBitcast(MVT::v16f32, V1);
	    SDValue FpV2 = DAG.getBitcast(MVT::v16f32, V2);
	    SDValue FpShuffle = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32,
	                                                     RepeatedMask, FpV1, FpV2,
	                                                     DAG);
	    return DAG.getBitcast(MVT::v16i32, FpShuffle);
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask, V1,
	                                                    V2, DAG, Subtarget))
	    return Expanded;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a 32-lane shuffle of 16-bit integers.
	///
	/// Requires AVX-512 BWI (asserted). Strategies are attempted in order and
	/// PERMV is the final fallback.
	static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
	  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

	  const MVT VT = MVT::v32i16;

	  // Lowering as a zext is strictly faster than any alternative, and in many
	  // cases it also lets memory operands be folded into the shuffle.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Extended;

	  // Dedicated unpack instructions for masks matching their pattern.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue Rotated =
	          lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Rotated;

	  if (V2.isUndef()) {
	    SmallVector<int, 8> LaneMask;
	    // As this is a single-input shuffle, a lane-repeated mask should be a
	    // strictly valid v8i16 mask that can be passed through to the v8i16
	    // lowering to handle even the v32 case.
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
	      return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V1, LaneMask,
	                                                       Subtarget, DAG);
	  }

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  if (SDValue ByteShuffled = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
	    return ByteShuffled;

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a 64-lane shuffle of 8-bit integers.
	///
	/// Requires AVX-512 BWI (asserted). Strategies are attempted in order; when
	/// none applies the shuffle is split and the pieces are lowered separately.
	static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
	  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
	  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

	  const MVT VT = MVT::v64i8;

	  // Lowering as a zext is strictly faster than any alternative, and in many
	  // cases it also lets memory operands be folded into the shuffle.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Extended;

	  // Dedicated unpack instructions for masks matching their pattern.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue Rotated =
	          lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Rotated;

	  if (SDValue ByteShuffled = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
	    return ByteShuffled;

	  // VBMI can use VPERMV/VPERMV3 byte shuffles.
	  if (Subtarget.hasVBMI())
	    return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

	  // Build an in-lane repeating shuffle mask and then shuffle the results
	  // into the target lanes.
	  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Permuted;

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Blended;

	  // FIXME: Implement direct support for this type!
	  return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Entry point for lowering the various 512-bit x86 vector shuffles.
	///
	/// Depending on the available instructions this either dispatches to the
	/// routine for the specific 512-bit vector type, or splits the shuffle into
	/// two 256-bit shuffles and fuses the results back together.
	static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/ basic ISA!");

	  // When exactly one element comes from V2 and it lands in element zero, try
	  // to insert it into V1 if that can be done cheaply.
	  const int NumElts = Mask.size();
	  const int NumFromV2 =
	      count_if(Mask, [NumElts](int Idx) { return Idx >= NumElts; });

	  if (NumFromV2 == 1 && Mask[0] >= NumElts) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Special cases where the lower or upper half is UNDEF.
	  if (SDValue HalfUndef =
	          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return HalfUndef;

	  // See whether a single element can simply be broadcast.
	  if (SDValue Splat =
	          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Splat;

	  // Dispatch to each element type for lowering. If we don't have support for
	  // specific element type shuffles at 512 bits, immediately split them and
	  // lower them. Each per-type routine may assume that the ISA extensions
	  // required for its element type are available.
	  switch (VT.SimpleTy) {
	  case MVT::v8f64:
	    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16f32:
	    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i64:
	    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i32:
	    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i16:
	    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v64i8:
	    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 512-bit x86 vector type!");
	  }
	}

	// Lower vXi1 vector shuffles.
	// There is no dedicated instruction on AVX-512 that shuffles the masks.
	// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
	// vector, shuffle that, and then truncate it back.
	//
	// VT is the vXi1 result type; V1/V2 are the mask operands and Mask is the
	// shuffle mask. Returns the node that converts the shuffled wide vector back
	// to VT (X86ISD::CVT2MASK when the subtarget check below allows it,
	// ISD::TRUNCATE otherwise).
	static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                      MVT VT, SDValue V1, SDValue V2,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/o basic ISA!");

	  // Pick the wide vector type in which the shuffle is actually performed.
	  MVT ExtVT;
	  switch (VT.SimpleTy) {
	  default:
	    llvm_unreachable("Expected a vector of i1 elements");
	  case MVT::v2i1:
	    ExtVT = MVT::v2i64;
	    break;
	  case MVT::v4i1:
	    ExtVT = MVT::v4i32;
	    break;
	  case MVT::v8i1:
	    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
	    // shuffle.
	    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
	    break;
	  case MVT::v16i1:
	    ExtVT = MVT::v16i32;
	    break;
	  case MVT::v32i1:
	    ExtVT = MVT::v32i16;
	    break;
	  case MVT::v64i1:
	    ExtVT = MVT::v64i8;
	    break;
	  }

	  // Widen V1. All-zeros / all-ones build vectors are materialized directly in
	  // the wide type instead of being sign-extended.
	  if (ISD::isBuildVectorAllZeros(V1.getNode()))
	    V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
	  else if (ISD::isBuildVectorAllOnes(V1.getNode()))
	    V1 = getOnesVector(ExtVT, DAG, DL);
	  else
	    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);

	  // Widen V2 the same way, additionally keeping an undef operand undef.
	  if (V2.isUndef())
	    V2 = DAG.getUNDEF(ExtVT);
	  else if (ISD::isBuildVectorAllZeros(V2.getNode()))
	    V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
	  else if (ISD::isBuildVectorAllOnes(V2.getNode()))
	    V2 = getOnesVector(ExtVT, DAG, DL);
	  else
	    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

	  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
	  // i1 was sign extended we can use X86ISD::CVT2MASK.
	  int NumElems = VT.getVectorNumElements();
	  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
	      (Subtarget.hasDQI() && (NumElems < 32)))
	    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);

	  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
	}

	/// Decide whether commuting the shuffle operands gives a more canonical
	/// mask.
	///
	/// \returns true when the shuffle should be commuted, false otherwise.
	static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
	  const int NumElements = Mask.size();

	  // Count how many defined mask entries select from each input.
	  int NumV1Elements = 0, NumV2Elements = 0;
	  for (int M : Mask) {
	    if (M < 0)
	      continue;
	    if (M < NumElements)
	      ++NumV1Elements;
	    else
	      ++NumV2Elements;
	  }

	  // Commute the shuffle as needed such that more elements come from V1 than
	  // V2. This allows the shuffle pattern to be matched strictly on how many
	  // elements come from V1 without handling the symmetric cases.
	  if (NumV2Elements > NumV1Elements)
	    return true;

	  assert(NumV1Elements > 0 && "No V1 indices");

	  if (NumV2Elements == 0)
	    return false;

	  // Only an exact tie between V1 and V2 needs the tie-breakers below.
	  if (NumV1Elements != NumV2Elements)
	    return false;

	  // Tie-breaker 1: minimize the number of uses of V2 in the low half of the
	  // vector.
	  int LowV1Elements = 0, LowV2Elements = 0;
	  for (int M : Mask.slice(0, NumElements / 2)) {
	    if (M >= NumElements)
	      ++LowV2Elements;
	    else if (M >= 0)
	      ++LowV1Elements;
	  }
	  if (LowV2Elements != LowV1Elements)
	    return LowV2Elements > LowV1Elements;

	  // Gather, per input, the sum of the result positions it feeds and how many
	  // of those positions are odd.
	  int SumV1Indices = 0, SumV2Indices = 0;
	  int NumV1OddIndices = 0, NumV2OddIndices = 0;
	  for (int Pos = 0; Pos != NumElements; ++Pos) {
	    if (Mask[Pos] >= NumElements) {
	      SumV2Indices += Pos;
	      NumV2OddIndices += Pos % 2;
	    } else if (Mask[Pos] >= 0) {
	      SumV1Indices += Pos;
	      NumV1OddIndices += Pos % 2;
	    }
	  }

	  // Tie-breaker 2: ensure the sum of indices for V1 is equal to or lower than
	  // the sum of indices for V2.
	  if (SumV2Indices != SumV1Indices)
	    return SumV2Indices < SumV1Indices;

	  // Tie-breaker 3: commute when V2 feeds fewer odd positions than V1.
	  return NumV2OddIndices < NumV1OddIndices;
	}

	/// \brief Top-level lowering for x86 vector shuffles.
	///
	/// This handles decomposition, canonicalization, and lowering of all x86
	/// vector shuffles. Most of the specific lowering strategies are encapsulated
	/// above in helper routines. The canonicalization attempts to widen shuffles
	/// to involve fewer lanes of wider elements, consolidate symmetric patterns
	/// s.t. only one of the two inputs needs to be tested, etc.
	///
	/// \p Op must be a ShuffleVectorSDNode (it is cast unconditionally).
	/// \returns the lowered value; several canonicalization steps return a new,
	/// rewritten shuffle node rather than a final lowering.
	static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
	  ArrayRef<int> Mask = SVOp->getMask();
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  MVT VT = Op.getSimpleValueType();
	  int NumElements = VT.getVectorNumElements();
	  SDLoc DL(Op);
	  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

	  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
	         "Can't lower MMX shuffles");

	  bool V1IsUndef = V1.isUndef();
	  bool V2IsUndef = V2.isUndef();
	  if (V1IsUndef && V2IsUndef)
	    return DAG.getUNDEF(VT);

	  // When we create a shuffle node we put the UNDEF node to second operand,
	  // but in some cases the first operand may be transformed to UNDEF.
	  // In this case we should just commute the node.
	  if (V1IsUndef)
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  // Check for non-undef masks pointing at an undef vector and make the masks
	  // undef as well. This makes it easier to match the shuffle based solely on
	  // the mask.
	  if (V2IsUndef)
	    for (int M : Mask)
	      if (M >= NumElements) {
	        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
	        for (int &NewM : NewMask)
	          if (NewM >= NumElements)
	            NewM = -1;
	        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	      }

	  // Check for illegal shuffle mask element index values. The (void) cast
	  // keeps the variable "used" when asserts are compiled out.
	  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
	  assert(llvm::all_of(Mask,
	                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
	         "Out of bounds shuffle index");

	  // We actually see shuffles that are entirely re-arrangements of a set of
	  // zero inputs. This mostly happens while decomposing complex shuffles into
	  // simple ones. Directly lower these as a buildvector of zeros.
	  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
	  if (Zeroable.isAllOnesValue())
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  // Try to collapse shuffles into using a vector type with fewer elements but
	  // wider element types. We cap this to not form integers or floating point
	  // elements wider than 64 bits, but it might be interesting to form i128
	  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
	  SmallVector<int, 16> WidenedMask;
	  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
	      canWidenShuffleElements(Mask, WidenedMask)) {
	    MVT NewEltVT = VT.isFloatingPoint()
	                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
	                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
	    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
	    // Make sure that the new vector type is legal. For example, v2f64 isn't
	    // legal on SSE1.
	    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
	      V1 = DAG.getBitcast(NewVT, V1);
	      V2 = DAG.getBitcast(NewVT, V2);
	      return DAG.getBitcast(
	          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
	    }
	  }

	  // Commute the shuffle if it will improve canonicalization.
	  if (canonicalizeShuffleMaskWithCommute(Mask))
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  // For each vector width, delegate to a specialized lowering routine.
	  if (VT.is128BitVector())
	    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (VT.is256BitVector())
	    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (VT.is512BitVector())
	    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (Is1BitVector)
	    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// \brief Try to lower a VSELECT instruction to a vector shuffle.
	///
	/// Succeeds only when the condition operand is a build vector of constants;
	/// otherwise a null SDValue is returned. For lane i the shuffle mask holds:
	///   - i            when the i-th condition element is a non-zero constant
	///                  (lane comes from operand 1),
	///   - i + NumLanes when it is a zero constant (lane comes from operand 2),
	///   - -1           when the element is not a ConstantSDNode.
	static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  SDValue CondVec = Op.getOperand(0);
	  SDValue TrueVec = Op.getOperand(1);
	  SDValue FalseVec = Op.getOperand(2);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  if (!ISD::isBuildVectorOfConstantSDNodes(CondVec.getNode()))
	    return SDValue();
	  auto *CondBV = cast<BuildVectorSDNode>(CondVec);

	  // Only non-legal VSELECTs reach this lowering, convert those into generic
	  // shuffles and re-use the shuffle lowering path for blends.
	  const int NumLanes = VT.getVectorNumElements();
	  SmallVector<int, 32> BlendMask;
	  for (int Lane = 0; Lane < NumLanes; ++Lane) {
	    SDValue Elt = CondBV->getOperand(Lane);
	    int Src = -1;
	    if (isa<ConstantSDNode>(Elt))
	      Src = isNullConstant(Elt) ? Lane + NumLanes : Lane;
	    BlendMask.push_back(Src);
	  }
	  return DAG.getVectorShuffle(VT, dl, TrueVec, FalseVec, BlendMask);
	}

	/// Custom lowering for VSELECT.
	///
	/// Returns Op itself when the node is kept as-is, a new node when the select
	/// is rewritten (as a blend shuffle, or as a mask-based select for 512-bit
	/// types), and a null SDValue in the remaining cases.
	SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
	  auto IsConstantBuildVector = [](SDValue V) {
	    return ISD::isBuildVectorOfConstantSDNodes(V.getNode());
	  };

	  // A vselect where all conditions and data are constants can be optimized into
	  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
	  if (IsConstantBuildVector(Op.getOperand(0)) &&
	      IsConstantBuildVector(Op.getOperand(1)) &&
	      IsConstantBuildVector(Op.getOperand(2)))
	    return SDValue();

	  // Try to lower this to a blend-style vector shuffle. This can handle all
	  // constant condition cases.
	  if (SDValue Blend = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
	    return Blend;

	  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
	  // with patterns on the mask registers on AVX-512.
	  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
	    return Op;

	  // Variable blends are only legal from SSE4.1 onward.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
	  // into an i1 condition so that we can use the mask-based 512-bit blend
	  // instructions.
	  if (VT.getSizeInBits() == 512) {
	    SDValue Cond = Op.getOperand(0);
	    // The vNi1 condition case should be handled above as it can be trivially
	    // lowered.
	    assert(Cond.getValueType().getScalarSizeInBits() ==
	    VT.getScalarSizeInBits() &&
	    "Should have a size-matched integer condition!");
	    // Build a mask by testing the condition against itself (tests for zero).
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
	    // Now return a new VSELECT using the mask.
	    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
	  }

	  // Only some types will be legal on some subtargets. If we can emit a legal
	  // VSELECT-matching blend, return Op, but if we need to expand, return a
	  // null value.

	  // FIXME: We should custom lower this by fixing the condition and using i8
	  // blends.
	  if (VT == MVT::v8i16 || VT == MVT::v16i16)
	    return SDValue();

	  // The byte blends for AVX vectors were introduced only in AVX2.
	  if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	    return SDValue();

	  // Most of the vector types have blends past SSE4.1.
	  return Op;
	}

	/// Lower an EXTRACT_VECTOR_ELT whose source is a 128-bit vector.
	///
	/// Handles three result shapes:
	///   - 8-bit results are produced through X86ISD::PEXTRB and a truncate,
	///   - f32 results are re-expressed as an i32 extract from v4i32 plus a
	///     bitcast, but only under the single-use conditions described below,
	///   - i32/i64 results with a constant index are returned unchanged.
	/// Returns a null SDValue for everything else, including any source vector
	/// that is not 128 bits wide.
	static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
	    return SDValue();

	  if (VT.getSizeInBits() == 8) {
	    // The PEXTRB node is built with an i32 result; truncate it back to VT.
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (VT == MVT::f32) {
	    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
	    // the result back to FR32 register. It's only worth matching if the
	    // result has a single use which is a store or a bitcast to i32. And in
	    // the case of a store, it's not worth it if the index is a constant 0,
	    // because a MOVSSmr can be used instead, which is smaller and faster.
	    if (!Op.hasOneUse())
	      return SDValue();
	    // hasOneUse() held above, so the first use is the only user.
	    SDNode *User = *Op.getNode()->use_begin();
	    if ((User->getOpcode() != ISD::STORE ||
	         isNullConstant(Op.getOperand(1))) &&
	        (User->getOpcode() != ISD::BITCAST ||
	         User->getValueType(0) != MVT::i32))
	      return SDValue();
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
	                                  Op.getOperand(1));
	    return DAG.getBitcast(MVT::f32, Extract);
	  }

	  if (VT == MVT::i32 || VT == MVT::i64) {
	    // ExtractPS/pextrq works with constant index.
	    if (isa<ConstantSDNode>(Op.getOperand(1)))
	      return Op;
	  }

	  return SDValue();
	}

	/// Extract one bit from mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// \param Op the extract node; operand 0 is the mask vector, operand 1 the
	///           element index.
	/// \returns Op itself for a constant index of 0 with an i32 result, otherwise
	///          a replacement node sequence.
	static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  SDValue Vec = Op.getOperand(0);
	  SDLoc dl(Vec);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);
	  MVT EltVT = Op.getSimpleValueType();

	  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
	         "Unexpected vector type in ExtractBitFromMaskVector");

	  // variable index can't be handled in mask registers,
	  // extend vector to VR512/128
	  if (!isa<ConstantSDNode>(Idx)) {
	    unsigned NumElts = VecVT.getVectorNumElements();
	    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
	    // than extending to 128/256bit.
	    // Up to 8 elements: pick the element width that makes the extended vector
	    // 128 bits wide; more than 8 elements: use i8 elements.
	    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
	    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
	    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
	  }

	  // Canonicalize result type to MVT::i32.
	  if (EltVT != MVT::i32) {
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  Vec, Idx);
	    return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

	  // Extracts from element 0 are always allowed.
	  if (IdxVal == 0)
	    return Op;

	  // If the kshift instructions of the correct width aren't natively supported
	  // then we need to promote the vector to the native size to get the correct
	  // zeroing behavior.
	  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
	      (VecVT.getVectorNumElements() < 8)) {
	    VecVT = MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
	                      DAG.getUNDEF(VecVT),
	                      Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  // Use kshiftr instruction to move to the lower element.
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                    DAG.getConstant(IdxVal, dl, MVT::i8));
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for EXTRACT_VECTOR_ELT.
	///
	/// Dispatch order:
	///   1. i1-element vectors go to ExtractBitFromMaskVector.
	///   2. Non-constant indices return a null SDValue (see the throughput
	///      comparison below).
	///   3. 256-bit and 512-bit sources are narrowed to the 128-bit chunk that
	///      holds the element, and a new extract on that chunk is returned.
	///   4. 128-bit sources are handled by result width (16, 8, 32, 64 bits).
	/// Returns Op itself when the node needs no change, and a null SDValue when no
	/// case applies.
	SDValue
	X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);

	  if (VecVT.getVectorElementType() == MVT::i1)
	    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

	  if (!isa<ConstantSDNode>(Idx)) {
	    // It's more profitable to go through memory (1 cycles throughput)
	    // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
	    // IACA tool was used to get performance estimation
	    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
	    //
	    // example : extractelement <16 x i8> %a, i32 %i
	    //
	    // Block Throughput: 3.00 Cycles
	    // Throughput Bottleneck: Port5
	    //
	    // | Num Of |   Ports pressure in cycles  |    |
	    // |  Uops  |  0 - DV  |  5  |  6  |  7  |    |
	    // ---------------------------------------------
	    // |   1    |          | 1.0 |     |     | CP | vmovd xmm1, edi
	    // |   1    |          | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
	    // |   2    | 1.0      | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
	    // Total Num Of Uops: 4
	    //
	    //
	    // Block Throughput: 1.00 Cycles
	    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
	    //
	    // |    |      Ports pressure in cycles      |  |
	    // |Uops| 1 | 2 - D  | 3 - D  | 4 | 5 |  |
	    // ---------------------------------------------------------
	    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
	    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
	    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
	    // Total Num Of Uops: 4

	    return SDValue();
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

	  // If this is a 256-bit or 512-bit vector, first extract the 128-bit vector
	  // and then extract the element from the 128-bit vector.
	  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
	    // Get the 128-bit vector.
	    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
	    MVT EltVT = VecVT.getVectorElementType();

	    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
	    // this can be done with a mask.
	    IdxVal &= ElemsPerChunk - 1;
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
	                       DAG.getConstant(IdxVal, dl, MVT::i32));
	  }

	  assert(VecVT.is128BitVector() && "Unexpected vector length");

	  MVT VT = Op.getSimpleValueType();

	  if (VT.getSizeInBits() == 16) {
	    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
	    // we're going to zero extend the register or fold the store (SSE41 only).
	    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
	        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
	      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
	                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

	    // Transform it so it match pextrw which produces a 32-bit result.
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (Subtarget.hasSSE41())
	    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
	      return Res;

	  // TODO: We only extract a single element from v16i8, we can probably afford
	  // to be more aggressive here before using the default approach of spilling to
	  // stack.
	  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
	    // Extract either the lowest i32 or any i16, and extract the sub-byte.
	    int DWordIdx = IdxVal / 4;
	    if (DWordIdx == 0) {
	      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                DAG.getBitcast(MVT::v4i32, Vec),
	                                DAG.getIntPtrConstant(DWordIdx, dl));
	      // Shift the wanted byte down to bits [7:0] before truncating.
	      int ShiftVal = (IdxVal % 4) * 8;
	      if (ShiftVal != 0)
	        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
	                          DAG.getConstant(ShiftVal, dl, MVT::i32));
	      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	    }

	    int WordIdx = IdxVal / 2;
	    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
	                              DAG.getBitcast(MVT::v8i16, Vec),
	                              DAG.getIntPtrConstant(WordIdx, dl));
	    // Odd byte indices live in the high half of the extracted word.
	    int ShiftVal = (IdxVal % 2) * 8;
	    if (ShiftVal != 0)
	      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
	                        DAG.getConstant(ShiftVal, dl, MVT::i16));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	  }

	  if (VT.getSizeInBits() == 32) {
	    if (IdxVal == 0)
	      return Op;

	    // SHUFPS the element to the lowest double word, then movss.
	    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (VT.getSizeInBits() == 64) {
	    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
	    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
	    // to match extract_elt for f64.
	    if (IdxVal == 0)
	      return Op;

	    // UNPCKHPD the element to the lowest double word, then movsd.
	    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
	    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
	    int Mask[2] = { 1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  return SDValue();
	}

	/// Insert one bit to mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// \param Op the insert node; operands are (vector, element, index).
	/// \returns a node sequence computing the vector with the element at the
	///          given index replaced.
	static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue Elt = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);
	  MVT VecVT = Vec.getSimpleValueType();

	  if (!isa<ConstantSDNode>(Idx)) {
	    // Non constant index. Extend source and destination,
	    // insert element and then truncate the result.
	    unsigned NumElts = VecVT.getVectorNumElements();
	    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
	    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
	    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
	                                DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
	                                DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  unsigned NumElems = VecVT.getVectorNumElements();

	  // If the kshift instructions of the correct width aren't natively supported
	  // then we need to promote the vector to the native size to get the correct
	  // zeroing behavior.
	  if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
	    // Need to promote to v16i1, do the insert, then extract back.
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
	                      DAG.getUNDEF(MVT::v16i1), Vec,
	                      DAG.getIntPtrConstant(0, dl));
	    Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);

	  if (Vec.isUndef()) {
	    // Nothing to preserve in the destination: just position the new bit.
	    if (IdxVal)
	      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	                             DAG.getConstant(IdxVal, dl, MVT::i8));
	    return EltInVec;
	  }

	  // Insertion of one bit into first position
	  if (IdxVal == 0) {
	    // Clean top bits of vector.
	    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
	    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
	                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
	    // Clean the first bit in source vector.
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                      DAG.getConstant(1, dl, MVT::i8));
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
	                      DAG.getConstant(1, dl, MVT::i8));

	    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
	  }
	  // Insertion of one bit into last position
	  if (IdxVal == NumElems - 1) {
	    // Move the bit to the last position inside the vector.
	    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	                           DAG.getConstant(IdxVal, dl, MVT::i8));
	    // Clean the last bit in the source vector.
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
	                      DAG.getConstant(1, dl, MVT::i8));
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                      DAG.getConstant(1, dl, MVT::i8));

	    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
	  }

	  // Move the current value of the bit to be replaced to bit 0.
	  SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                               DAG.getConstant(IdxVal, dl, MVT::i8));
	  // Xor with the new bit.
	  Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
	  // Shift to MSB, filling bottom bits with 0.
	  Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
	                       DAG.getConstant(NumElems - 1, dl, MVT::i8));
	  // Shift to the final position, filling upper bits with 0.
	  Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
	                       DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
	  // Xor with original vector to cancel out the original bit value that's still
	  // present.
	  return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
	}

	/// Custom lowering for INSERT_VECTOR_ELT.
	///
	/// Operands of Op are (vector, element, index). i1-element vectors are
	/// forwarded to InsertBitToMaskVector; a non-constant index yields a null
	/// SDValue. Otherwise the insert is rewritten as a blend, split into a
	/// 128-bit insert for wider vectors, or mapped onto PINSR*/INSERTPS nodes.
	/// Returns Op itself when the node needs no change, and a null SDValue when
	/// no case applies.
	SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElts = VT.getVectorNumElements();

	  if (EltVT == MVT::i1)
	    return InsertBitToMaskVector(Op, DAG, Subtarget);

	  SDLoc dl(Op);
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);
	  SDValue N2 = Op.getOperand(2);
	  if (!isa<ConstantSDNode>(N2))
	    return SDValue();
	  auto *N2C = cast<ConstantSDNode>(N2);
	  unsigned IdxVal = N2C->getZExtValue();

	  bool IsZeroElt = X86::isZeroNode(N1);
	  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

	  // If we are inserting an element, see if we can do this more efficiently with
	  // a blend shuffle with a rematerializable vector than a costly integer
	  // insertion.
	  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
	      16 <= EltVT.getSizeInBits()) {
	    // Identity mask, except lane IdxVal which is taken from the constant
	    // vector (second shuffle input).
	    SmallVector<int, 8> BlendMask;
	    for (unsigned i = 0; i != NumElts; ++i)
	      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
	    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
	                                  : getOnesVector(VT, DAG, dl);
	    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
	  }

	  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
	  // into that, and then insert the subvector back into the result.
	  if (VT.is256BitVector() || VT.is512BitVector()) {
	    // With a 256-bit vector, we can insert into the zero element efficiently
	    // using a blend if we have AVX or AVX2 and the right data type.
	    if (VT.is256BitVector() && IdxVal == 0) {
	      // TODO: It is worthwhile to cast integer to floating point and back
	      // and incur a domain crossing penalty if that's what we'll end up
	      // doing anyway after extracting to a 128-bit vector.
	      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
	          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
	        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
	        N2 = DAG.getIntPtrConstant(1, dl);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
	      }
	    }

	    // Get the desired 128-bit vector chunk.
	    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

	    // Insert the element into the desired chunk.
	    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(NumEltsIn128));
	    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
	    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

	    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
	                    DAG.getConstant(IdxIn128, dl, MVT::i32));

	    // Insert the changed part back into the bigger vector
	    return insert128BitVector(N0, V, IdxVal, DAG, dl);
	  }
	  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

	  // Transform it so it match pinsr{b,w} which expects a GR32 as its second
	  // argument. SSE41 required for pinsrb.
	  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    unsigned Opc;
	    if (VT == MVT::v8i16) {
	      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
	      Opc = X86ISD::PINSRW;
	    } else {
	      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
	      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
	      Opc = X86ISD::PINSRB;
	    }

	    if (N1.getValueType() != MVT::i32)
	      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
	    if (N2.getValueType() != MVT::i32)
	      N2 = DAG.getIntPtrConstant(IdxVal, dl);
	    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
	  }

	  if (Subtarget.hasSSE41()) {
	    if (EltVT == MVT::f32) {
	      // Bits [7:6] of the constant are the source select. This will always be
	      // zero here. The DAG Combiner may combine an extract_elt index into
	      // these bits. For example (insert (extract, 3), 2) could be matched by
	      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
	      // Bits [5:4] of the constant are the destination select. This is the
	      // value of the incoming immediate.
	      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
	      // combine either bitwise AND or insert of float 0.0 to set these bits.

	      bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
	      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
	        // If this is an insertion of 32-bits into the low 32-bits of
	        // a vector, we prefer to generate a blend with immediate rather
	        // than an insertps. Blends are simpler operations in hardware and so
	        // will always have equal or better performance than insertps.
	        // But if optimizing for size and there's a load folding opportunity,
	        // generate insertps because blendps does not have a 32-bit memory
	        // operand form.
	        N2 = DAG.getIntPtrConstant(1, dl);
	        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
	      }
	      // Place the destination lane index in bits [5:4] of the immediate.
	      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
	      // Create this as a scalar to vector..
	      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
	    }

	    // PINSR* works with constant index.
	    if (EltVT == MVT::i32 || EltVT == MVT::i64)
	      return Op;
	  }

	  return SDValue();
	}

	/// Custom lowering for SCALAR_TO_VECTOR.
	///
	/// A zero scalar becomes a zero vector. Result types that are not 128 bits
	/// wide are built as a 128-bit SCALAR_TO_VECTOR inserted at position 0 of an
	/// undef vector of the full type. A v4i32 result is returned unchanged; every
	/// other 128-bit result is formed by any-extending the scalar to i32,
	/// building a v4i32 and bitcasting to the requested type.
	static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT OpVT = Op.getSimpleValueType();

	  // It's always cheaper to replace a xor+movd with xorps and simplifies further
	  // combines.
	  if (X86::isZeroNode(Op.getOperand(0)))
	    return getZeroVector(OpVT, Subtarget, DAG, dl);

	  // For a result wider than 128 bits, first build the 128-bit vector and then
	  // insert it into the low part of the wide vector.
	  if (!OpVT.is128BitVector()) {
	    // Number of 128-bit chunks in the result type.
	    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
	    unsigned NarrowNumElts = OpVT.getVectorNumElements() / SizeFactor;
	    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), NarrowNumElts);

	    SDValue Narrow =
	        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

	    // Insert the 128-bit vector.
	    SDValue WideUndef = DAG.getUNDEF(OpVT);
	    return insert128BitVector(WideUndef, Narrow, 0, DAG, dl);
	  }
	  assert(OpVT.is128BitVector() && "Expected an SSE type!");

	  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
	  if (OpVT == MVT::v4i32)
	    return Op;

	  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
	  SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt);
	  return DAG.getBitcast(OpVT, AsV4I32);
	}

	// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
	// simple superregister reference or explicit instructions to insert
	// the upper bits of a vector.
	//
	// Only result types with i1 elements are expected here (asserted below); the
	// actual work is delegated to insert1BitVector.
	static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

	return insert1BitVector(Op, DAG, Subtarget);
	}

	/// Custom lowering for EXTRACT_SUBVECTOR on vXi1 types.
	///
	/// Operands of Op are (vector, index). A non-constant index yields a null
	/// SDValue; index 0 is returned unchanged. For any other index the source is
	/// shifted right by the index with KSHIFTR (after widening when needed) and
	/// the result is extracted from position 0.
	static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Only vXi1 extract_subvectors need custom lowering");

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue Idx = Op.getOperand(1);

	  if (!isa<ConstantSDNode>(Idx))
	    return SDValue();

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  if (IdxVal == 0) // the operation is legal
	    return Op;

	  MVT VecVT = Vec.getSimpleValueType();
	  unsigned NumElems = VecVT.getVectorNumElements();

	  // Extend to natively supported kshift.
	  MVT WideVecVT = VecVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
	    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
	                      DAG.getUNDEF(WideVecVT), Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  // Shift to the LSB.
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
	                    DAG.getConstant(IdxVal, dl, MVT::i8));

	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	// Returns the appropriate wrapper opcode for a global reference.
	//
	// GV may be null. The result is X86ISD::WrapperRIP only when GV is not an
	// absolute symbol reference, the subtarget uses RIP-relative PIC style, and
	// the code model is Small or Kernel; otherwise it is X86ISD::Wrapper.
	unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
	  // References to absolute symbols are never PC-relative.
	  if (GV && GV->isAbsoluteSymbolRef())
	    return X86ISD::Wrapper;

	  CodeModel::Model M = getTargetMachine().getCodeModel();
	  if (Subtarget.isPICStyleRIPRel() &&
	      (M == CodeModel::Small || M == CodeModel::Kernel))
	    return X86ISD::WrapperRIP;

	  return X86ISD::Wrapper;
	}

	// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
	// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
	// one of the above mentioned nodes. It has to be wrapped because otherwise
	// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
	// be used to form addressing mode. These wrapped nodes will be selected
	// into MOV32ri.
	//
	// This routine handles the ConstantPool case: it builds the target constant
	// pool node, wraps it, and adds the global base register when the local
	// reference classification yields a non-zero operand flag.
	SDValue
	X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
	  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

	  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	  // global base reg.
	  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetCP = DAG.getTargetConstantPool(
	      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
	  SDLoc DL(CP);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetCP);
	  if (!OpFlag)
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
	}

	/// Lower a JumpTable node: build the target jump-table node, wrap it, and
	/// add the global base register when the local reference classification
	/// yields a non-zero operand flag.
	SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
	  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

	  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	  // global base reg.
	  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
	  SDLoc DL(JT);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetJT);
	  if (!OpFlag)
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
	}

	/// Lower an ExternalSymbol node: build the wrapped target external symbol,
	/// add the global base register for position-independent non-64-bit code, and
	/// load through the stub when the reference classification requires one.
	SDValue
	X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
	  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

	  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	  // global base reg.
	  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
	  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetSym = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

	  SDLoc DL(Op);
	  SDValue Addr = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetSym);

	  // With PIC, the address is actually $g + Offset.
	  const bool NeedsPICBase = isPositionIndependent() && !Subtarget.is64Bit();
	  if (NeedsPICBase) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	    Addr = DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Addr);
	  }

	  // For symbols that require a load from a stub to get the address, emit the
	  // load.
	  if (!isGlobalStubReference(OpFlag))
	    return Addr;

	  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
	                     MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	}

	/// Lower a BlockAddress node: build the wrapped target block address and add
	/// the global base register when the reference is relative to the PIC base.
	SDValue
	X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
	  // Create the TargetBlockAddress node.
	  unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
	  auto *BANode = cast<BlockAddressSDNode>(Op);
	  const BlockAddress *BA = BANode->getBlockAddress();
	  int64_t Offset = BANode->getOffset();
	  SDLoc dl(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetBA = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
	  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, TargetBA);

	  if (!isGlobalRelativeToPICBase(OpFlags))
	    return Wrapped;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
	  return DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Wrapped);
	}

	/// Build the address of global GV plus Offset.
	///
	/// The offset is folded into the target global address node only when the
	/// reference needs no operand flags and the offset is suitable for the code
	/// model; otherwise it is added explicitly at the end. The PIC base add and
	/// the stub load are emitted as dictated by the reference classification.
	SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
	                                              const SDLoc &dl, int64_t Offset,
	                                              SelectionDAG &DAG) const {
	  // Create the TargetGlobalAddress node, folding in the constant
	  // offset if it is legal.
	  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
	  CodeModel::Model M = DAG.getTarget().getCodeModel();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  const bool FoldOffset = OpFlags == X86II::MO_NO_FLAG &&
	                          X86::isOffsetSuitableForCodeModel(Offset, M);
	  SDValue Addr;
	  if (!FoldOffset) {
	    Addr = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
	  } else {
	    // A direct static reference to a global.
	    Addr = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
	    Offset = 0;
	  }

	  Addr = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Addr);

	  // With PIC, the address is actually $g + Offset.
	  if (isGlobalRelativeToPICBase(OpFlags)) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
	    Addr = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Addr);
	  }

	  // For globals that require a load from a stub to get the address, emit the
	  // load.
	  if (isGlobalStubReference(OpFlags))
	    Addr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
	                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));

	  // If there was a non-zero offset that we didn't fold, create an explicit
	  // addition for it.
	  if (Offset == 0)
	    return Addr;

	  return DAG.getNode(ISD::ADD, dl, PtrVT, Addr,
	                     DAG.getConstant(Offset, dl, PtrVT));
	}

	/// Lower a GlobalAddress node by unpacking its global and offset and
	/// forwarding them to the (GV, dl, Offset, DAG) overload.
	SDValue
	X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
	  auto *GANode = cast<GlobalAddressSDNode>(Op);
	  return LowerGlobalAddress(GANode->getGlobal(), SDLoc(Op),
	                            GANode->getOffset(), DAG);
	}

	/// Emit a TLSADDR (or, when LocalDynamic is set, TLSBASEADDR) node for GA and
	/// return a copy of ReturnReg taken after it.
	///
	/// \param Chain        incoming chain for the TLS node.
	/// \param InFlag       optional glue; when non-null its value is appended as
	///                     the last operand of the TLS node.
	/// \param ReturnReg    register the result is copied from.
	/// \param OperandFlags flags placed on the target global address operand.
	static SDValue
	GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
	           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
	           unsigned char OperandFlags, bool LocalDynamic = false) {
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDLoc dl(GA);
	  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                           GA->getValueType(0),
	                                           GA->getOffset(),
	                                           OperandFlags);

	  X86ISD::NodeType CallType = X86ISD::TLSADDR;
	  if (LocalDynamic)
	    CallType = X86ISD::TLSBASEADDR;

	  // Operands are the chain and the address, plus the glue when provided.
	  SmallVector<SDValue, 3> Ops;
	  Ops.push_back(Chain);
	  Ops.push_back(TGA);
	  if (InFlag)
	    Ops.push_back(*InFlag);
	  Chain = DAG.getNode(CallType, dl, NodeTys, Ops);

	  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
	  MFI.setAdjustsStack(true);
	  MFI.setHasCalls(true);

	  SDValue OutGlue = Chain.getValue(1);
	  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, OutGlue);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
	//
	// The global base register is copied into EBX first; the glue of that copy is
	// passed on to GetTLSADDR, whose result is read from EAX.
	static SDValue
	LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDValue Glue;
	  SDLoc dl(GA); // ? function entry point might be better
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  SDValue Chain =
	      DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, PICBase, Glue);
	  Glue = Chain.getValue(1);

	  return GetTLSADDR(DAG, Chain, GA, &Glue, PtrVT, X86::EAX, X86II::MO_TLSGD);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
	//
	// No glue is passed; the TLS node hangs off the entry node and its result is
	// read from RAX.
	static SDValue
	LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDValue EntryChain = DAG.getEntryNode();
	  return GetTLSADDR(DAG, EntryChain, GA, /*InFlag=*/nullptr, PtrVT, X86::RAX,
	                    X86II::MO_TLSGD);
	}

	/// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
	///
	/// Computes the module's TLS block base through GetTLSADDR (with
	/// LocalDynamic set) and adds the wrapped x@dtpoff offset of GA to it. Also
	/// bumps the function's count of local-dynamic TLS accesses.
	///
	/// \param is64Bit selects the 64-bit sequence (RAX, MO_TLSLD, no glue) or the
	///                32-bit one (global base register copied to EBX, result in
	///                EAX, MO_TLSLDM).
	static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
	                                           SelectionDAG &DAG,
	                                           const EVT PtrVT,
	                                           bool is64Bit) {
	  SDLoc dl(GA);

	  // Get the start address of the TLS block for this module.
	  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
	                                    .getInfo<X86MachineFunctionInfo>();
	  MFI->incNumLocalDynamicTLSAccesses();

	  SDValue Base;
	  if (is64Bit) {
	    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
	                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
	  } else {
	    SDValue InFlag;
	    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
	        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
	    InFlag = Chain.getValue(1);
	    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
	                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
	  }

	  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
	  // of Base.

	  // Build x@dtpoff.
	  unsigned char OperandFlags = X86II::MO_DTPOFF;
	  unsigned WrapperKind = X86ISD::Wrapper;
	  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                           GA->getValueType(0),
	                                           GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  // Add x@dtpoff with the base.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
	}

	// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
	//
	// The result is ThreadPointer + Offset. The thread pointer is loaded from
	// address 0 in address space 256 (32-bit) or 257 (64-bit). For local exec the
	// offset is the wrapped target global address itself; for initial exec it is
	// loaded from the GOT (on 32-bit PIC the global base register is added to the
	// GOT slot address first). Any other model is unreachable.
	static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                   const EVT PtrVT, TLSModel::Model model,
	                                   bool is64Bit, bool isPIC) {
	  SDLoc dl(GA);

	  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
	  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
	                                                         is64Bit ? 257 : 256));

	  SDValue ThreadPointer =
	      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
	                  MachinePointerInfo(Ptr));

	  unsigned char OperandFlags = 0;
	  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
	  // initialexec.
	  unsigned WrapperKind = X86ISD::Wrapper;
	  if (model == TLSModel::LocalExec) {
	    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
	  } else if (model == TLSModel::InitialExec) {
	    if (is64Bit) {
	      OperandFlags = X86II::MO_GOTTPOFF;
	      WrapperKind = X86ISD::WrapperRIP;
	    } else {
	      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
	    }
	  } else {
	    llvm_unreachable("Unexpected model");
	  }

	  // emit "addl x@ntpoff,%eax" (local exec)
	  // or "addl x@indntpoff,%eax" (initial exec)
	  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
	  SDValue TGA =
	      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
	                                 GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  if (model == TLSModel::InitialExec) {
	    if (isPIC && !is64Bit) {
	      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);
	    }

	    // Initial exec keeps the thread-pointer offset in a GOT slot; load it.
	    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  }

	  // The address of the thread local variable is the add of the thread
	  // pointer with the offset of the variable.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
	}

	/// Lower an ISD::GlobalTLSAddress node to the address of the thread-local
	/// variable it names.
	///
	/// Dispatch order: emulated TLS (if enabled in the target options), then ELF
	/// (by TLS model), then Darwin (single TLSCALL-based model), then Windows
	/// (MSVC, Itanium and GNU environments, via the implicit TLS array). Any other
	/// target is unreachable.
	SDValue
	X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

	  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

	  if (DAG.getTarget().Options.EmulatedTLS)
	    return LowerToTLSEmulatedModel(GA, DAG);

	  const GlobalValue *GV = GA->getGlobal();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  bool PositionIndependent = isPositionIndependent();

	  if (Subtarget.isTargetELF()) {
	    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
	    switch (model) {
	      case TLSModel::GeneralDynamic:
	        if (Subtarget.is64Bit())
	          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
	        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
	      case TLSModel::LocalDynamic:
	        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
	                                           Subtarget.is64Bit());
	      case TLSModel::InitialExec:
	      case TLSModel::LocalExec:
	        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
	                                   PositionIndependent);
	    }
	    llvm_unreachable("Unknown TLS model.");
	  }

	  if (Subtarget.isTargetDarwin()) {
	    // Darwin only has one model of TLS. Lower to that.
	    unsigned char OpFlag = 0;
	    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
	                           X86ISD::WrapperRIP : X86ISD::Wrapper;

	    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	    // global base reg.
	    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
	    if (PIC32)
	      OpFlag = X86II::MO_TLVP_PIC_BASE;
	    else
	      OpFlag = X86II::MO_TLVP;
	    SDLoc DL(Op);
	    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
	                                                GA->getValueType(0),
	                                                GA->getOffset(), OpFlag);
	    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

	    // With PIC32, the address is actually $g + Offset.
	    if (PIC32)
	      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);

	    // Lowering the machine isd will make sure everything is in the right
	    // location.
	    SDValue Chain = DAG.getEntryNode();
	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
	    SDValue Args[] = { Chain, Offset };
	    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
	    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
	                               DAG.getIntPtrConstant(0, DL, true),
	                               Chain.getValue(1), DL);

	    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
	    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	    MFI.setAdjustsStack(true);

	    // And our return value (tls address) is in the standard call return value
	    // location.
	    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
	  }

	  if (Subtarget.isTargetKnownWindowsMSVC() ||
	      Subtarget.isTargetWindowsItanium() ||
	      Subtarget.isTargetWindowsGNU()) {
	    // Just use the implicit TLS architecture
	    // Need to generate something similar to:
	    // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
	    // ; from TEB
	    // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
	    // mov rcx, qword [rdx+rcx*8]
	    // mov eax, .tls$:tlsvar
	    // [rax+rcx] contains the address
	    // Windows 64bit: gs:0x58
	    // Windows 32bit: fs:__tls_array

	    SDLoc dl(GA);
	    SDValue Chain = DAG.getEntryNode();

	    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
	    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
	    // use its literal value of 0x2C.
	    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
	                                        ? Type::getInt8PtrTy(*DAG.getContext(),
	                                                             256)
	                                        : Type::getInt32PtrTy(*DAG.getContext(),
	                                                              257));

	    SDValue TlsArray = Subtarget.is64Bit()
	                           ? DAG.getIntPtrConstant(0x58, dl)
	                           : (Subtarget.isTargetWindowsGNU()
	                                  ? DAG.getIntPtrConstant(0x2C, dl)
	                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

	    SDValue ThreadPointer =
	        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

	    SDValue res;
	    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
	      // Local exec: the TLS array entry is read directly, without consulting
	      // _tls_index.
	      res = ThreadPointer;
	    } else {
	      // Load the _tls_index variable
	      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
	      if (Subtarget.is64Bit())
	        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
	                             MachinePointerInfo(), MVT::i32);
	      else
	        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

	      // Scale the index by the pointer size to address the array slot.
	      auto &DL = DAG.getDataLayout();
	      SDValue Scale =
	          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
	      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

	      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
	    }

	    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

	    // Get the offset of start of .tls section
	    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                             GA->getValueType(0),
	                                             GA->getOffset(), X86II::MO_SECREL);
	    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

	    // The address of the thread local variable is the add of the thread
	    // pointer with the offset of the variable.
	    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
	  }

	  llvm_unreachable("TLS not implemented for this target.");
	}

	/// Lower SRA_PARTS and friends. The node takes (low part, high part, shift
	/// amount) and yields the shifted (low, high) pair merged into one value.
	static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  MVT PartVT = Op.getSimpleValueType();
	  unsigned PartBits = PartVT.getSizeInBits();
	  SDLoc DL(Op);
	  const bool IsArith = Op.getOpcode() == ISD::SRA_PARTS;
	  SDValue LoPart = Op.getOperand(0);
	  SDValue HiPart = Op.getOperand(1);
	  SDValue Amount = Op.getOperand(2);

	  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
	  // generic ISD nodes haven't. Mask the amount for the generic shifts; the
	  // AND is optimized away during isel.
	  SDValue MaskedAmount =
	      DAG.getNode(ISD::AND, DL, MVT::i8, Amount,
	                  DAG.getConstant(PartBits - 1, DL, MVT::i8));

	  // Value that fills the vacated part for oversized shifts: the sign of the
	  // high part for arithmetic shifts, zero otherwise.
	  SDValue Fill;
	  if (IsArith)
	    Fill = DAG.getNode(ISD::SRA, DL, PartVT, HiPart,
	                       DAG.getConstant(PartBits - 1, DL, MVT::i8));
	  else
	    Fill = DAG.getConstant(0, DL, PartVT);

	  const bool IsLeft = Op.getOpcode() == ISD::SHL_PARTS;
	  SDValue Funnel, Plain;
	  if (IsLeft) {
	    Funnel = DAG.getNode(X86ISD::SHLD, DL, PartVT, HiPart, LoPart, Amount);
	    Plain = DAG.getNode(ISD::SHL, DL, PartVT, LoPart, MaskedAmount);
	  } else {
	    const unsigned RightOpc = IsArith ? ISD::SRA : ISD::SRL;
	    Funnel = DAG.getNode(X86ISD::SHRD, DL, PartVT, LoPart, HiPart, Amount);
	    Plain = DAG.getNode(RightOpc, DL, PartVT, HiPart, MaskedAmount);
	  }

	  // If the shift amount is larger or equal than the width of a part we can't
	  // rely on the results of shld/shrd. Test that bit of the amount and select
	  // the appropriate values for large shift amounts.
	  SDValue WideBit = DAG.getNode(ISD::AND, DL, MVT::i8, Amount,
	                                DAG.getConstant(PartBits, DL, MVT::i8));
	  SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, WideBit,
	                              DAG.getConstant(0, DL, MVT::i8));
	  SDValue CondNE = DAG.getConstant(X86::COND_NE, DL, MVT::i8);

	  SDValue FunnelSel[4] = { Funnel, Plain, CondNE, Flags };
	  SDValue FillSel[4] = { Plain, Fill, CondNE, Flags };
	  SDValue FromFunnel = DAG.getNode(X86ISD::CMOV, DL, PartVT, FunnelSel);
	  SDValue FromFill = DAG.getNode(X86ISD::CMOV, DL, PartVT, FillSel);

	  // For a left shift the funnel result is the high part; for right shifts it
	  // is the low part.
	  SDValue Lo = IsLeft ? FromFill : FromFunnel;
	  SDValue Hi = IsLeft ? FromFunnel : FromFill;

	  SDValue Parts[2] = { Lo, Hi };
	  return DAG.getMergeValues(Parts, DL);
	}

	/// Custom lowering for SINT_TO_FP.
	///
	/// Vector sources: v2i32 -> v2f64 and v2i1 -> v2f64 are widened and converted
	/// with X86ISD::CVTSI2P; any other vector case returns an empty SDValue.
	/// Scalar sources: conversions that are effectively legal return Op unchanged;
	/// everything else is stored to a fresh stack slot and converted via BuildFILD.
	SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (SrcVT.isVector()) {
	    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
	      SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
	                                 DAG.getUNDEF(SrcVT));
	      return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Wide);
	    }
	    if (SrcVT == MVT::v2i1) {
	      // For v2i1, we need to widen to v4i1 first.
	      assert(VT == MVT::v2f64 && "Unexpected type");
	      SDValue WideMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
	                                     DAG.getUNDEF(MVT::v2i1));
	      SDValue Extended =
	          DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, WideMask);
	      return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(), Extended);
	    }
	    return SDValue();
	  }

	  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
	         "Unknown SINT_TO_FP to lower!");

	  SDValue ValueToStore = Src;
	  if (isScalarFPTypeInSSEReg(Op.getValueType())) {
	    // These are really Legal; return the operand so the caller accepts it as
	    // Legal.
	    if (SrcVT == MVT::i32)
	      return Op;
	    if (SrcVT == MVT::i64) {
	      if (Subtarget.is64Bit())
	        return Op;
	      // Bitcasting to f64 here allows us to do a single 64-bit store from
	      // an SSE register, avoiding the store forwarding penalty that would come
	      // with two 32-bit stores.
	      ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
	    }
	  }

	  // Spill the integer to a stack slot of its own size and FILD it from there.
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned Bytes = SrcVT.getSizeInBits()/8;
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  int SlotFI = MF.getFrameInfo().CreateStackObject(Bytes, Bytes, false);
	  SDValue Slot = DAG.getFrameIndex(SlotFI, PtrVT);
	  SDValue StoreChain =
	      DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, Slot,
	                   MachinePointerInfo::getFixedStack(MF, SlotFI));
	  return BuildFILD(Op, SrcVT, StoreChain, Slot, DAG);
	}

	/// Build an FILD of an integer of type SrcVT located at StackSlot.
	///
	/// StackSlot is either a frame index or a load node (in which case the load's
	/// memory operand and address operand are reused). When the result type lives
	/// in an SSE register, FILD_FLAG is used and the x87 result is stored with a
	/// glued FST to a new stack slot and loaded back as the result type.
	SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
	                                     SDValue StackSlot,
	                                     SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  const bool ResultInSSE = isScalarFPTypeInSSEReg(Op.getValueType());
	  SDVTList Tys = ResultInSSE
	                     ? DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue)
	                     : DAG.getVTList(Op.getValueType(), MVT::Other);

	  unsigned LoadBytes = SrcVT.getSizeInBits()/8;

	  // Describe the memory being read by the FILD.
	  MachineMemOperand *LoadMMO;
	  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot)) {
	    int LoadFI = FI->getIndex();
	    LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), LoadFI),
	        MachineMemOperand::MOLoad, LoadBytes, LoadBytes);
	  } else {
	    LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
	    StackSlot = StackSlot.getOperand(1);
	  }

	  SDValue FILDOps[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
	  const unsigned FILDOpc = ResultInSSE ? X86ISD::FILD_FLAG : X86ISD::FILD;
	  SDValue Result =
	      DAG.getMemIntrinsicNode(FILDOpc, DL, Tys, FILDOps, SrcVT, LoadMMO);

	  if (!ResultInSSE)
	    return Result;

	  Chain = Result.getValue(1);
	  SDValue Glue = Result.getValue(2);

	  // FIXME: Currently the FST is flagged to the FILD_FLAG. This
	  // shouldn't be necessary except that RFP cannot be live across
	  // multiple blocks. When stackifier is fixed, they can be uncoupled.
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned SpillBytes = Op.getValueSizeInBits()/8;
	  int SpillFI = MF.getFrameInfo().CreateStackObject(SpillBytes, SpillBytes,
	                                                    false);
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  SDValue SpillSlot = DAG.getFrameIndex(SpillFI, PtrVT);
	  Tys = DAG.getVTList(MVT::Other);
	  SDValue FSTOps[] = {
	    Chain, Result, SpillSlot, DAG.getValueType(Op.getValueType()), Glue
	  };
	  MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SpillFI),
	      MachineMemOperand::MOStore, SpillBytes, SpillBytes);

	  // Store the x87 value to memory, then reload it as the requested type.
	  Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
	                                  FSTOps, Op.getValueType(), StoreMMO);
	  return DAG.getLoad(
	      Op.getValueType(), DL, Chain, SpillSlot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SpillFI));
	}

	/// 64-bit unsigned integer to double expansion.
	///
	/// The two 32-bit halves of the input are interleaved with the exponent words
	/// 0x43300000 / 0x45300000, the matching bias constants are subtracted, and
	/// the two lanes are summed (FHADD with SSE3, shuffle + FADD otherwise).
	/// Returns element 0 of the resulting v2f64 as an f64.
	static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  // This algorithm is not obvious. Here it is what we're trying to output:
	  /*
	  movq %rax, %xmm0
	  punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
	  subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
	  #ifdef __SSE3__
	  haddpd %xmm0, %xmm0
	  #else
	  pshufd $0x4e, %xmm0, %xmm1
	  addpd %xmm1, %xmm0
	  #endif
	  */

	  SDLoc dl(Op);
	  LLVMContext *Context = DAG.getContext();

	  // Build some magic constants.
	  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
	  Constant *C0 = ConstantDataVector::get(*Context, CV0);
	  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

	  SmallVector<Constant*,2> CV1;
	  CV1.push_back(
	      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                        APInt(64, 0x4330000000000000ULL))));
	  CV1.push_back(
	      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                        APInt(64, 0x4530000000000000ULL))));
	  Constant *C1 = ConstantVector::get(CV1);
	  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

	  // Load the 64-bit value into an XMM register.
	  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                            Op.getOperand(0));
	  SDValue CLod0 =
	      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue Unpck1 =
	      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

	  // The second constant load is chained after the first one.
	  SDValue CLod1 =
	      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
	  SDValue Result;

	  if (Subtarget.hasSSE3()) {
	    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
	    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
	  } else {
	    // Swap the two 64-bit lanes (as a v4i32 shuffle) and add.
	    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
	    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
	    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
	                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
	  }

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// 32-bit unsigned integer to float expansion.
	///
	/// The input is placed in the low 32 bits of an f64-sized lane with the upper
	/// bits zeroed, OR'ed with the bit pattern of the bias constant
	/// (0x4330000000000000), and the bias is then subtracted as an f64. The
	/// difference is extended or rounded to Op's result type.
	static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDLoc dl(Op);
	// FP constant to bias correct the final result.
	SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
	MVT::f64);

	// Load the 32-bit value into an XMM register.
	SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
	Op.getOperand(0));

	// Zero out the upper parts of the register.
	Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

	// Reinterpret the register as v2f64 and take lane 0 as an f64.
	Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	DAG.getBitcast(MVT::v2f64, Load),
	DAG.getIntPtrConstant(0, dl));

	// Or the load with the bias. The OR is done on v2i64, so both f64 operands
	// are first moved into vectors and bitcast to integers.
	SDValue Or = DAG.getNode(
	ISD::OR, dl, MVT::v2i64,
	DAG.getBitcast(MVT::v2i64,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
	DAG.getBitcast(MVT::v2i64,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
	// Back to a scalar f64 (lane 0 of the OR result).
	Or =
	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

	// Subtract the bias.
	// TODO: Are there any fast-math-flags to propagate here?
	SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

	// Handle final rounding.
	return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
	}

	/// Lower UINT_TO_FP from v2i32 to v2f64; returns an empty SDValue for any other
	/// result type. With AVX512 a single X86ISD::CVTUI2P is emitted. Otherwise each
	/// element is split into 16-bit halves that are converted with the signed
	/// X86ISD::CVTSI2P and recombined as hi * 2^16 + lo.
	static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget, SDLoc &DL) {
	  if (Op.getSimpleValueType() != MVT::v2f64)
	    return SDValue();

	  SDValue Src = Op.getOperand(0);
	  assert(Src.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

	  // Legalize to v4i32 type.
	  SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
	                             DAG.getUNDEF(MVT::v2i32));

	  if (Subtarget.hasAVX512())
	    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, Wide);

	  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
	  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
	  SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::v4i32);
	  SDValue Low16Mask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

	  // Two to the power of half-word-size.
	  SDValue Pow2_16 = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

	  // Split each element: upper half shifted down, lower half masked.
	  SDValue UpperInt = DAG.getNode(ISD::SRL, DL, MVT::v4i32, Wide, ShiftBy16);
	  SDValue LowerInt = DAG.getNode(ISD::AND, DL, MVT::v4i32, Wide, Low16Mask);

	  SDValue UpperFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, UpperInt);
	  SDValue UpperScaled =
	      DAG.getNode(ISD::FMUL, DL, MVT::v2f64, UpperFP, Pow2_16);
	  SDValue LowerFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LowerInt);

	  // Add the two halves.
	  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, UpperScaled, LowerFP);
	}

	/// Lower UINT_TO_FP from v4i32 to v4f32 or from v8i32 to v8f32.
	///
	/// Returns an empty SDValue when unsafe-fp-math is enabled or when the result
	/// type is not the float vector matching the source width.
	static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // The algorithm is the following:
	  // #ifdef __SSE4_1__
	  // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	  // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	  // (uint4) 0x53000000, 0xaa);
	  // #else
	  // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	  // uint4 hi = (v >> 16) | (uint4) 0x53000000;
	  // #endif
	  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  // return (float4) lo + fhi;

	  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
	  // reassociate the two FADDs, and if we do that, the algorithm fails
	  // spectacularly (PR24512).
	  // FIXME: If we ever have some kind of Machine FMF, this should be marked
	  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
	  // there's also the MachineCombiner reassociations happening on Machine IR.
	  if (DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  SDLoc DL(Op);
	  SDValue V = Op->getOperand(0);
	  MVT VecIntVT = V.getSimpleValueType();
	  bool Is128 = VecIntVT == MVT::v4i32;
	  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
	  // If we convert to something else than the supported type, e.g., to v4f64,
	  // abort early.
	  if (VecFloatVT != Op->getSimpleValueType(0))
	    return SDValue();

	  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
	         "Unsupported custom type");

	  // In the #ifdef/#else code, we have in common:
	  // - The vector of constants:
	  // -- 0x4b000000
	  // -- 0x53000000
	  // - A shift:
	  // -- v >> 16

	  // Create the splat vector for 0x4b000000.
	  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
	  // Create the splat vector for 0x53000000.
	  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

	  // Create the right shift.
	  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
	  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

	  SDValue Low, High;
	  if (Subtarget.hasSSE41()) {
	    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
	    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
	    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
	    // Low will be bitcasted right away, so do not bother bitcasting back to its
	    // original type.
	    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
	                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	    // (uint4) 0x53000000, 0xaa);
	    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
	    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
	    // High will be bitcasted right away, so do not bother bitcasting back to
	    // its original type.
	    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
	                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	  } else {
	    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
	    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
	    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

	    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
	    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
	  }

	  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
	  SDValue VecCstFAdd = DAG.getConstantFP(
	      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

	  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue FHigh =
	      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
	  // return (float4) lo + fhi;
	  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
	  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
	}

	/// Dispatch a vector UINT_TO_FP by source type: v2i1 is widened and zero
	/// extended before X86ISD::CVTUI2P; v2i32 and v4i32/v8i32 go to their dedicated
	/// helpers. Any other source type is unreachable.
	static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  SDLoc dl(Op);

	  if (SrcVT == MVT::v2i1) {
	    // For v2i1, we need to widen to v4i1 first.
	    assert(Op.getValueType() == MVT::v2f64 && "Unexpected type");
	    SDValue WideMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
	                                   DAG.getUNDEF(MVT::v2i1));
	    SDValue Extended =
	        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, WideMask);
	    return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64, Extended);
	  }

	  if (SrcVT == MVT::v2i32)
	    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);

	  if (SrcVT == MVT::v4i32 || SrcVT == MVT::v8i32) {
	    assert(!Subtarget.hasAVX512());
	    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
	  }

	  llvm_unreachable("Custom UINT_TO_FP is not supported!");
	}

	/// Custom lowering for UINT_TO_FP.
	///
	/// Vector results are forwarded to lowerUINT_TO_FP_vec. Scalar conversions
	/// return Op unchanged when AVX512 makes them legal, use the SSE expansions
	/// when X86ScalarSSEf64 is set, return an empty SDValue for i64 -> f32 on
	/// 64-bit targets, and otherwise go through a 64-bit stack slot and FILD. For
	/// an i64 source the FILD result is corrected in f80 by adding a fudge constant
	/// when the sign bit of the input was set, then rounded to the result type.
	SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue N0 = Op.getOperand(0);
	  SDLoc dl(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  if (Op.getSimpleValueType().isVector())
	    return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);

	  MVT SrcVT = N0.getSimpleValueType();
	  MVT DstVT = Op.getSimpleValueType();

	  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
	      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
	    // Conversions from unsigned i32 to f32/f64 are legal,
	    // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
	    return Op;
	  }

	  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
	    return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
	  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
	    return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
	  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
	    return SDValue();

	  // Make a 64-bit buffer, and use it to build an FILD.
	  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
	  if (SrcVT == MVT::i32) {
	    // Store the value in the low 4 bytes and zero in the high 4 bytes, then
	    // FILD the buffer as an i64.
	    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
	    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
	                                  StackSlot, MachinePointerInfo());
	    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
	                                  OffsetSlot, MachinePointerInfo());
	    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
	    return Fild;
	  }

	  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
	  SDValue ValueToStore = Op.getOperand(0);
	  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
	    // Bitcasting to f64 here allows us to do a single 64-bit store from
	    // an SSE register, avoiding the store forwarding penalty that would come
	    // with two 32-bit stores.
	    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
	  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
	                               MachinePointerInfo());
	  // For i64 source, we need to add the appropriate power of 2 if the input
	  // was negative. This is the same as the optimization in
	  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
	  // we must be careful to do the computation in x87 extended precision, not
	  // in SSE. (The generic code can't know it's OK to do this, or how to.)
	  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
	  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	      MachineMemOperand::MOLoad, 8, 8);

	  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
	  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
	                                         MVT::i64, MMO);

	  APInt FF(32, 0x5F800000ULL);

	  // Check whether the sign bit is set.
	  SDValue SignSet = DAG.getSetCC(
	      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
	      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

	  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
	  SDValue FudgePtr = DAG.getConstantPool(
	      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

	  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
	  SDValue Zero = DAG.getIntPtrConstant(0, dl);
	  SDValue Four = DAG.getIntPtrConstant(4, dl);
	  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
	  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

	  // Load the value out, extending it from f32 to f80.
	  // FIXME: Avoid the extend by constructing the right constant pool?
	  SDValue Fudge = DAG.getExtLoad(
	      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
	      /* Alignment = */ 4);
	  // Extend everything to 80 bits to force it to be done on x87.
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
	  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
	                     DAG.getIntPtrConstant(0, dl));
	}

	// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
	// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
	// just return an <SDValue(), SDValue()> pair.
	// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
	// to i16, i32 or i64, and we lower it to a legal sequence.
	// If lowered to the final integer result we return a <result, SDValue()> pair.
	// Otherwise we lower it to a sequence ending with a FIST, return a
	// <FIST, StackSlot> pair, and the caller is responsible for loading
	// the final integer result from StackSlot.
	std::pair<SDValue,SDValue>
	X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
	bool IsSigned, bool IsReplace) const {
	SDLoc DL(Op);

	EVT DstTy = Op.getValueType();
	EVT TheVT = Op.getOperand(0).getValueType();
	auto PtrVT = getPointerTy(DAG.getDataLayout());

	if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
	// f16 must be promoted before using the lowering in this routine.
	// fp128 does not use this lowering.
	return std::make_pair(SDValue(), SDValue());
	}

	// If using FIST to compute an unsigned i64, we'll need some fixup
	// to handle values above the maximum signed i64. A FIST is always
	// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
	bool UnsignedFixup = !IsSigned &&
	DstTy == MVT::i64 &&
	(!Subtarget.is64Bit() \|\|
	!isScalarFPTypeInSSEReg(TheVT));

	if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
	// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
	// The low 32 bits of the fist result will have the correct uint32 result.
	assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
	DstTy = MVT::i64;
	}

	assert(DstTy.getSimpleVT() <= MVT::i64 &&
	DstTy.getSimpleVT() >= MVT::i16 &&
	"Unknown FP_TO_INT to lower!");

	// These are really Legal.
	if (DstTy == MVT::i32 &&
	isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
	return std::make_pair(SDValue(), SDValue());
	if (Subtarget.is64Bit() &&
	DstTy == MVT::i64 &&
	isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
	return std::make_pair(SDValue(), SDValue());

	// We lower FP->int64 into FISTP64 followed by a load from a temporary
	// stack slot.
	MachineFunction &MF = DAG.getMachineFunction();
	unsigned MemSize = DstTy.getSizeInBits()/8;
	int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

	unsigned Opc;
	switch (DstTy.getSimpleVT().SimpleTy) {
	default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
	case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
	case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
	case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
	}

	SDValue Chain = DAG.getEntryNode();
	SDValue Value = Op.getOperand(0);
	SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

	if (UnsignedFixup) {
	//
	// Conversion to unsigned i64 is implemented with a select,
	// depending on whether the source value fits in the range
	// of a signed i64. Let Thresh be the FP equivalent of
	// 0x8000000000000000ULL.
	//
	// Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
	// FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
	// Fist-to-mem64 FistSrc
	// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
	// to XOR'ing the high 32 bits with Adjust.
	//
	// Being a power of 2, Thresh is exactly representable in all FP formats.
	// For X87 we'd like to use the smallest FP type for this constant, but
	// for DAG type consistency we have to match the FP operand type.

	APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
	LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
	bool LosesInfo = false;
	if (TheVT == MVT::f64)
	// The rounding mode is irrelevant as the conversion should be exact.
	Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
	&LosesInfo);
	else if (TheVT == MVT::f80)
	Status = Thresh.convert(APFloat::x87DoubleExtended(),
	APFloat::rmNearestTiesToEven, &LosesInfo);

	assert(Status == APFloat::opOK && !LosesInfo &&
	"FP conversion should have been exact");

	SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

	SDValue Cmp = DAG.getSetCC(DL,
	getSetCCResultType(DAG.getDataLayout(),
	*DAG.getContext(), TheVT),
	Value, ThreshVal, ISD::SETLT);
	Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
	DAG.getConstant(0, DL, MVT::i32),
	DAG.getConstant(0x80000000, DL, MVT::i32));
	SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
	Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
	*DAG.getContext(), TheVT),
	Value, ThreshVal, ISD::SETLT);
	Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
	}

	// FIXME This causes a redundant load/store if the SSE-class value is already
	// in memory, such as if it is on the callstack.
	if (isScalarFPTypeInSSEReg(TheVT)) {
	assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
	Chain = DAG.getStore(Chain, DL, Value, StackSlot,
	MachinePointerInfo::getFixedStack(MF, SSFI));
	SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
	SDValue Ops[] = {
	Chain, StackSlot, DAG.getValueType(TheVT)
	};

	MachineMemOperand *MMO =
	MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	MachineMemOperand::MOLoad, MemSize, MemSize);
	Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
	Chain = Value.getValue(1);
	SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
	}

	MachineMemOperand *MMO =
	MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	MachineMemOperand::MOStore, MemSize, MemSize);

	if (UnsignedFixup) {

	// Insert the FIST, load its result as two i32's,
	// and XOR the high i32 with Adjust.

	SDValue FistOps[] = { Chain, Value, StackSlot };
	SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
	FistOps, DstTy, MMO);

	SDValue Low32 =
	DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
	SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

	SDValue High32 =
	DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
	High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

	if (Subtarget.is64Bit()) {
	// Join High32 and Low32 into a 64-bit result.
	// (High32 << 32) \| Low32
	Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
	High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
	High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
	DAG.getConstant(32, DL, MVT::i8));
	SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
	return std::make_pair(Result, SDValue());
	}

	SDValue ResultOps[] = { Low32, High32 };

	SDValue pair = IsReplace
	? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
	: DAG.getMergeValues(ResultOps, DL);
	return std::make_pair(pair, SDValue());
	} else {
	// Build the FP_TO_INT*_IN_MEM
	SDValue Ops[] = { Chain, Value, StackSlot };
	SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
	Ops, DstTy, MMO);
	return std::make_pair(FIST, StackSlot);
	}
	}

	/// Lower a vector integer extension \c Op on AVX-class targets.
	///
	/// Only the (result type, input type) pairs listed in the guard below are
	/// handled; for any other pair an empty SDValue is returned. When the
	/// subtarget reports hasInt256() a single X86ISD::VZEXT node is emitted.
	/// Otherwise the input is unpacked (low and high) against a zero vector when
	/// \c Op is ISD::ZERO_EXTEND, or against undef for any other opcode, and the
	/// two halves are concatenated into the result type.
	static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);

	  // Bail out unless (VT, InVT) is one of the supported extension pairs.
	  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
	      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
	      (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
	      (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
	      (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
	      (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
	      (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
	      (VT != MVT::v32i16 || InVT != MVT::v32i8))
	    return SDValue();

	  if (Subtarget.hasInt256())
	    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

	  // Optimize vectors in AVX mode:
	  //
	  //   v8i16 -> v8i32
	  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
	  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
	  //   Concat upper and lower parts.
	  //
	  //   v4i32 -> v4i64
	  //   Use vpunpckldq for 2 lower elements  v4i32 -> v2i64.
	  //   Use vpunpckhdq for 2 upper elements  v4i32 -> v2i64.
	  //   Concat upper and lower parts.
	  //

	  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
	  SDValue Undef = DAG.getUNDEF(InVT);
	  // Only a zero extension needs defined (zero) upper bits in each element.
	  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
	  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
	  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);

	  // Half-width result type: same element type, half the element count.
	  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
	                             VT.getVectorNumElements()/2);

	  OpLo = DAG.getBitcast(HVT, OpLo);
	  OpHi = DAG.getBitcast(HVT, OpHi);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	/// Lower a zero extension whose input is a vector of i1.
	///
	/// The result is built as select(mask, 1, 0). Without BWI, results with
	/// elements of 16 bits or fewer are first computed with i32 elements and
	/// truncated afterwards. Without VLX, anything narrower than 512 bits is
	/// widened to 512 bits for the select and the original width is extracted
	/// again at the end.
	static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  const MVT ResVT = Op->getSimpleValueType(0);
	  SDValue Mask = Op->getOperand(0);
	  MVT MaskVT = Mask.getSimpleValueType();
	  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");

	  const MVT ResEltVT = ResVT.getVectorElementType();
	  unsigned EltCount = ResVT.getVectorNumElements();

	  // Promote i8/i16 elements to i32 when BWI is unavailable.
	  const bool PromoteElts =
	      !Subtarget.hasBWI() && ResEltVT.getSizeInBits() <= 16;
	  const MVT PromotedVT =
	      PromoteElts ? MVT::getVectorVT(MVT::i32, EltCount) : ResVT;

	  // Pad the mask (and the select type) out to 512 bits when VLX is missing.
	  MVT CurVT = PromotedVT;
	  if (!Subtarget.hasVLX() && !PromotedVT.is512BitVector()) {
	    EltCount *= 512 / PromotedVT.getSizeInBits();
	    MaskVT = MVT::getVectorVT(MVT::i1, EltCount);
	    SDValue UndefMask = DAG.getUNDEF(MaskVT);
	    SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
	    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MaskVT, UndefMask, Mask,
	                       ZeroIdx);
	    CurVT = MVT::getVectorVT(PromotedVT.getVectorElementType(), EltCount);
	  }

	  // Materialise the extension as a per-lane choice between 1 and 0.
	  SDValue Ones = DAG.getConstant(1, DL, CurVT);
	  SDValue Zeros = getZeroVector(CurVT, Subtarget, DAG, DL);
	  SDValue Result = DAG.getSelect(DL, CurVT, Mask, Ones, Zeros);

	  // Undo the element promotion, keeping the (possibly widened) lane count.
	  if (ResVT != PromotedVT) {
	    CurVT = MVT::getVectorVT(ResEltVT, EltCount);
	    Result = DAG.getNode(ISD::TRUNCATE, DL, CurVT, Result);
	  }

	  // Nothing more to do if no widening happened.
	  if (CurVT == ResVT)
	    return Result;

	  // Otherwise take the low subvector of the requested type.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Result,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Custom lowering entry point for ISD::ZERO_EXTEND.
	///
	/// Inputs with i1 elements are forwarded to LowerZERO_EXTEND_Mask. Otherwise,
	/// when the subtarget reports hasFp256(), LowerAVXExtend is tried. If neither
	/// path produces a node an empty SDValue is returned.
	static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  SDValue In = Op.getOperand(0);
	  MVT SVT = In.getSimpleValueType();

	  if (SVT.getVectorElementType() == MVT::i1)
	    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

	  if (Subtarget.hasFp256())
	    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
	      return Res;

	  // A 128-bit to 256-bit extension with matching element counts is not
	  // expected to reach this point unlowered.
	  assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
	         Op.getSimpleValueType().getVectorNumElements() !=
	         SVT.getVectorNumElements());
	  return SDValue();
	}

	/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
	/// It makes use of the fact that vectors with enough leading sign/zero bits
	/// prevent the PACKSS/PACKUS from saturating the results.
	/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
	/// within each 128-bit lane.
	///
	/// \c Opcode must be X86ISD::PACKSS or X86ISD::PACKUS. Returns an empty
	/// SDValue when the subtarget lacks SSE2, has AVX512, or when the source and
	/// destination sizes are not multiples of 256 and 128 bits respectively.
	/// Returns \c In unchanged when it already has type \c DstVT.
	static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
	                                      const SDLoc &DL, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
	         "Unexpected PACK opcode");

	  // Requires SSE2 but AVX512 has fast truncate.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  EVT SrcVT = In.getValueType();

	  // No truncation required, we might get here due to recursive calls.
	  if (SrcVT == DstVT)
	    return In;

	  // We only support vector truncation to 128bits or greater from a
	  // 256bits or greater source.
	  unsigned DstSizeInBits = DstVT.getSizeInBits();
	  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
	  if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
	    return SDValue();

	  LLVMContext &Ctx = *DAG.getContext();
	  unsigned NumElems = SrcVT.getVectorNumElements();
	  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
	  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

	  // Each PACK stage halves the element width.
	  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

	  // Extract lower/upper subvectors.
	  unsigned NumSubElts = NumElems / 2;
	  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
	  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

	  // Pack to the largest type possible:
	  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
	  EVT InVT = MVT::i16, OutVT = MVT::i8;
	  if (DstVT.getScalarSizeInBits() > 8 &&
	      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
	    InVT = MVT::i32;
	    OutVT = MVT::i16;
	  }

	  unsigned SubSizeInBits = SrcSizeInBits / 2;
	  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
	  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

	  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
	  if (SrcVT.is256BitVector()) {
	    Lo = DAG.getBitcast(InVT, Lo);
	    Hi = DAG.getBitcast(InVT, Hi);
	    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
	    return DAG.getBitcast(DstVT, Res);
	  }

	  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
	  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
	  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
	    Lo = DAG.getBitcast(InVT, Lo);
	    Hi = DAG.getBitcast(InVT, Hi);
	    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

	    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
	    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
	    Res = DAG.getBitcast(MVT::v4i64, Res);
	    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

	    if (DstVT.is256BitVector())
	      return DAG.getBitcast(DstVT, Res);

	    // If 512bit -> 128bit truncate another stage.
	    EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	    Res = DAG.getBitcast(PackedVT, Res);
	    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	  }

	  // Recursively pack lower/upper subvectors, concat result and pack again.
	  assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
	  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
	  Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
	  Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

	  PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
	  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	}

	/// Lower a vector truncate whose result has i1 elements.
	///
	/// The least significant bit of each input element is moved into the sign
	/// position (skipped when ComputeNumSignBits shows every bit already equals
	/// the sign bit) and the mask is then produced with X86ISD::CVT2MASK
	/// (inputs with elements of 16 bits or fewer, BWI available) or
	/// X86ISD::TESTM. Without BWI, such narrow inputs are sign-extended to wider
	/// elements first.
	static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {

	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();

	  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

	  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
	  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
	  if (InVT.getScalarSizeInBits() <= 16) {
	    if (Subtarget.hasBWI()) {
	      // legal, will go to VPMOVB2M, VPMOVW2M
	      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	        // We need to shift to get the lsb into sign position.
	        // Shift packed bytes not supported natively, bitcast to word
	        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
	        In = DAG.getNode(ISD::SHL, DL, ExtVT,
	                         DAG.getBitcast(ExtVT, In),
	                         DAG.getConstant(ShiftInx, DL, ExtVT));
	        In = DAG.getBitcast(InVT, In);
	      }
	      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, In);
	    }
	    // Use TESTD/Q, extended vector to packed dword/qword.
	    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
	           "Unexpected vector type.");
	    unsigned NumElts = InVT.getVectorNumElements();
	    // With VLX use i32 elements; otherwise pick the element width that makes
	    // the extended vector 512 bits wide.
	    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
	    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
	    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
	    InVT = ExtVT;
	    ShiftInx = InVT.getScalarSizeInBits() - 1;
	  }

	  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	    // We need to shift to get the lsb into sign position.
	    In = DAG.getNode(ISD::SHL, DL, InVT, In,
	                     DAG.getConstant(ShiftInx, DL, InVT));
	  }
	  return DAG.getNode(X86ISD::TESTM, DL, VT, In, In);
	}

	/// Custom lowering for a vector ISD::TRUNCATE.
	///
	/// Strategies are tried in this order:
	///   1. i1 results are handed to LowerTruncateVecI1.
	///   2. With AVX512, emit X86ISD::VTRUNC (sign-extending v16i16 to v16i32
	///      first when BWI is missing).
	///   3. PACKSS, when the input has enough sign bits.
	///   4. PACKUS, when the input has enough known leading zero bits.
	///   5. Dedicated shuffle sequences for v4i64->v4i32 and v8i32->v8i16.
	///   6. A generic even-element shuffle for remaining 256-bit to 128-bit cases.
	/// An empty SDValue is returned when none of these apply.
	SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  unsigned InNumEltBits = InVT.getScalarSizeInBits();

	  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	         "Invalid TRUNCATE operation");

	  if (VT.getVectorElementType() == MVT::i1)
	    return LowerTruncateVecI1(Op, DAG, Subtarget);

	  // vpmovqb/w/d, vpmovdb/w, vpmovwb
	  if (Subtarget.hasAVX512()) {
	    // word to byte only under BWI
	    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
	      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
	                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
	    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
	  }

	  // Truncate with PACKSS if we are truncating a vector with sign-bits that
	  // extend all the way to the packed/truncated value.
	  unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
	  if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
	    if (SDValue V =
	            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
	      return V;

	  // Truncate with PACKUS if we are truncating a vector with leading zero bits
	  // that extend all the way to the packed/truncated value.
	  // Pre-SSE41 we can only use PACKUSWB.
	  KnownBits Known;
	  DAG.computeKnownBits(In, Known);
	  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
	  if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
	    if (SDValue V =
	            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
	      return V;

	  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
	    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
	    if (Subtarget.hasInt256()) {
	      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
	      In = DAG.getBitcast(MVT::v8i32, In);
	      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
	                         DAG.getIntPtrConstant(0, DL));
	    }

	    // Without AVX2: split into two v2i64 halves and gather the even i32 lanes.
	    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                               DAG.getIntPtrConstant(0, DL));
	    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                               DAG.getIntPtrConstant(2, DL));
	    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
	    static const int ShufMask[] = {0, 2, 4, 6};
	    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
	  }

	  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
	    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
	    if (Subtarget.hasInt256()) {
	      In = DAG.getBitcast(MVT::v32i8, In);

	      // The PSHUFB mask:
	      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
	                                      -1, -1, -1, -1, -1, -1, -1, -1,
	                                      16, 17, 20, 21, 24, 25, 28, 29,
	                                      -1, -1, -1, -1, -1, -1, -1, -1 };
	      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
	      In = DAG.getBitcast(MVT::v4i64, In);

	      // Bring the low quadword of each 128-bit lane together.
	      static const int ShufMask2[] = {0, 2, -1, -1};
	      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
	      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                       DAG.getIntPtrConstant(0, DL));
	      return DAG.getBitcast(VT, In);
	    }

	    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	                               DAG.getIntPtrConstant(0, DL));

	    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	                               DAG.getIntPtrConstant(4, DL));

	    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
	    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

	    // The PSHUFB mask:
	    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
	                                   -1, -1, -1, -1, -1, -1, -1, -1};

	    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
	    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

	    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

	    // The MOVLHPS Mask:
	    static const int ShufMask2[] = {0, 1, 4, 5};
	    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
	    return DAG.getBitcast(MVT::v8i16, res);
	  }

	  // Handle truncation of V256 to V128 using shuffles.
	  if (!VT.is128BitVector() || !InVT.is256BitVector())
	    return SDValue();

	  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

	  unsigned NumElems = VT.getVectorNumElements();
	  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

	  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
	  // Prepare truncation shuffle mask
	  for (unsigned i = 0; i != NumElems; ++i)
	    MaskVec[i] = i * 2;
	  In = DAG.getBitcast(NVT, In);
	  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Custom lowering for ISD::FP_TO_SINT and ISD::FP_TO_UINT (any opcode other
	/// than FP_TO_SINT is treated as unsigned).
	///
	/// Vector results: v2f64 -> v2i1 is converted through a wider integer vector
	/// and truncated; v2f32 -> v2i64 is converted after widening the source to
	/// v4f32; anything else returns an empty SDValue.
	/// Scalar results are delegated to FP_TO_INTHelper. If the helper yields no
	/// node, \c Op itself is returned; if it also yields a stack slot, the result
	/// is loaded from that slot.
	SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
	  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
	  MVT VT = Op.getSimpleValueType();

	  if (VT.isVector()) {
	    SDValue Src = Op.getOperand(0);
	    SDLoc dl(Op);

	    if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
	      MVT ResVT = MVT::v4i32;
	      MVT TruncVT = MVT::v4i1;
	      unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	      if (!IsSigned && !Subtarget.hasVLX()) {
	        // Widen to 512-bits.
	        ResVT = MVT::v8i32;
	        TruncVT = MVT::v8i1;
	        Opc = ISD::FP_TO_UINT;
	        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
	                          DAG.getUNDEF(MVT::v8f64),
	                          Src, DAG.getIntPtrConstant(0, dl));
	      }
	      SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
	      Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
	                         DAG.getIntPtrConstant(0, dl));
	    }

	    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
	      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
	                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	                                     DAG.getUNDEF(MVT::v2f32)));
	    }

	    return SDValue();
	  }

	  assert(!VT.isVector());

	  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
	    IsSigned, /*IsReplace=*/ false);
	  SDValue FIST = Vals.first, StackSlot = Vals.second;
	  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
	  if (!FIST.getNode())
	    return Op;

	  if (StackSlot.getNode())
	    // Load the result.
	    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

	  // The node is the result.
	  return FIST;
	}

	/// Custom-lower an FP_EXTEND whose source is v2f32: the source is widened to
	/// v4f32 with an undef upper half and converted with X86ISD::VFPEXT.
	static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
	  SDValue Src = Op.getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();
	  assert(SrcVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

	  SDLoc DL(Op);
	  const MVT ResVT = Op.getSimpleValueType();

	  // Pad the two source lanes with two undefined lanes.
	  SDValue UpperUndef = DAG.getUNDEF(SrcVT);
	  SDValue Widened =
	      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src, UpperUndef);

	  return DAG.getNode(X86ISD::VFPEXT, DL, ResVT, Widened);
	}

	/// The only differences between FABS and FNEG are the mask and the logic op.
	/// FNEG also has a folding opportunity for FNEG(FABS(x)).
	///
	/// FABS is lowered to FAND with a mask that clears the sign bit, FNEG to FXOR
	/// with the sign mask, and FNEG(FABS(x)) to FOR of x with the sign mask.
	/// Scalars other than f128 are performed as a 128-bit vector operation and
	/// element 0 is extracted afterwards.
	static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
	  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
	         "Wrong opcode for lowering FABS or FNEG.");

	  bool IsFABS = (Op.getOpcode() == ISD::FABS);

	  // If this is a FABS and it has an FNEG user, bail out to fold the combination
	  // into an FNABS. We'll lower the FABS after that if it is still in use.
	  if (IsFABS)
	    for (SDNode *User : Op->uses())
	      if (User->getOpcode() == ISD::FNEG)
	        return Op;

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  bool IsF128 = (VT == MVT::f128);

	  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
	  // decide if we should generate a 16-byte constant mask when we only need 4 or
	  // 8 bytes for the scalar case.

	  MVT LogicVT;
	  MVT EltVT;

	  if (VT.isVector()) {
	    LogicVT = VT;
	    EltVT = VT.getVectorElementType();
	  } else if (IsF128) {
	    // SSE instructions are used for optimized f128 logical operations.
	    LogicVT = MVT::f128;
	    EltVT = VT;
	  } else {
	    // There are no scalar bitwise logical SSE/AVX instructions, so we
	    // generate a 16-byte vector constant and logic op even for the scalar case.
	    // Using a 16-byte mask allows folding the load of the mask with
	    // the logic op, so it can save (~4 bytes) on code size.
	    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
	    EltVT = VT;
	  }

	  unsigned EltBits = EltVT.getSizeInBits();
	  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
	  APInt MaskElt =
	    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
	  const fltSemantics &Sem =
	      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
	          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
	  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

	  SDValue Op0 = Op.getOperand(0);
	  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
	  unsigned LogicOp =
	    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
	  // For FNEG(FABS(x)) operate directly on x.
	  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

	  if (VT.isVector() || IsF128)
	    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

	  // For the scalar case extend to a 128-bit vector, perform the logic op,
	  // and extract the scalar result back out.
	  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
	  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for FCOPYSIGN: the result takes its magnitude from operand
	/// 0 and its sign bit from operand 1.
	///
	/// The sign operand is first converted to the result type if its width
	/// differs. The result is then (Mag & ~SignMask) | (Sign & SignMask) using the
	/// X86 FP logic nodes. Scalars other than f128 are processed as 128-bit
	/// vectors and element 0 is extracted at the end.
	static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDValue Mag = Op.getOperand(0);
	  SDValue Sign = Op.getOperand(1);
	  SDLoc dl(Op);

	  // If the sign operand is smaller, extend it first.
	  MVT VT = Op.getSimpleValueType();
	  if (Sign.getSimpleValueType().bitsLT(VT))
	    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

	  // And if it is bigger, shrink it first.
	  if (Sign.getSimpleValueType().bitsGT(VT))
	    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

	  // At this point the operands and the result should have the same
	  // type, and that won't be f80 since that is not custom lowered.
	  bool IsF128 = (VT == MVT::f128);
	  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
	          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
	          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
	         "Unexpected type in LowerFCOPYSIGN");

	  MVT EltVT = VT.getScalarType();
	  const fltSemantics &Sem =
	      EltVT == MVT::f64 ? APFloat::IEEEdouble()
	                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

	  // Perform all scalar logic operations as 16-byte vectors because there are no
	  // scalar FP logic instructions in SSE.
	  // TODO: This isn't necessary. If we used scalar types, we might avoid some
	  // unnecessary splats, but we might miss load folding opportunities. Should
	  // this decision be based on OptimizeForSize?
	  bool IsFakeVector = !VT.isVector() && !IsF128;
	  MVT LogicVT = VT;
	  if (IsFakeVector)
	    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

	  // The mask constants are automatically splatted for vector types.
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  SDValue SignMask = DAG.getConstantFP(
	      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
	  SDValue MagMask = DAG.getConstantFP(
	      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

	  // First, clear all bits but the sign bit from the second operand (sign).
	  if (IsFakeVector)
	    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
	  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

	  // Next, clear the sign bit from the first operand (magnitude).
	  // TODO: If we had general constant folding for FP logic ops, this check
	  // wouldn't be necessary.
	  SDValue MagBits;
	  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
	    APFloat APF = Op0CN->getValueAPF();
	    APF.clearSign();
	    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
	  } else {
	    // If the magnitude operand wasn't a constant, we need to AND out the sign.
	    if (IsFakeVector)
	      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
	    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
	  }

	  // OR the magnitude value with the sign bit.
	  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
	  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
	                                          DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for ISD::FGETSIGN on an f32 or f64 operand: the scalar is
	/// placed in a 128-bit vector, X86ISD::MOVMSK produces an i32, and that value
	/// is zero-extended or truncated to the result type and masked with 1.
	static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDValue N0 = Op.getOperand(0);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  MVT OpVT = N0.getSimpleValueType();
	  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
	         "Unexpected type for FGETSIGN");

	  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
	  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
	  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
	  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
	  Res = DAG.getZExtOrTrunc(Res, dl, VT);
	  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
	  return Res;
	}

	// Check whether an OR'd tree is PTEST-able.
	//
	// The tree rooted at Op must consist solely of ORs whose leaves are
	// EXTRACT_VECTOR_ELT nodes with constant indices, taken from 128- or 256-bit
	// vectors of one common type, and every element of every such vector must be
	// extracted. On success the source vectors are bitcast to v2i64/v4i64, OR'd
	// together and fed to a single X86ISD::PTEST; otherwise an empty SDValue is
	// returned. Requires SSE4.1 and a single use of Op.
	static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  if (!Op->hasOneUse())
	    return SDValue();

	  SDNode *N = Op.getNode();
	  SDLoc DL(N);

	  SmallVector<SDValue, 8> Opnds;
	  // Per source vector: bitmask of the element indices seen so far. The mask is
	  // 64 bits wide because a 256-bit vector of i8 has 32 elements, and shifting
	  // a 32-bit value by 32 is undefined behaviour.
	  DenseMap<SDValue, uint64_t> VecInMap;
	  SmallVector<SDValue, 8> VecIns;
	  EVT VT = MVT::Other;

	  // Recognize a special case where a vector is casted into wide integer to
	  // test all 0s.
	  Opnds.push_back(N->getOperand(0));
	  Opnds.push_back(N->getOperand(1));

	  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
	    // Copy the operand by value: the push_back calls below may reallocate
	    // Opnds, which would invalidate an iterator or reference into it.
	    SDValue Cur = Opnds[Slot];
	    // BFS traverse all OR'd operands.
	    if (Cur.getOpcode() == ISD::OR) {
	      Opnds.push_back(Cur.getOperand(0));
	      Opnds.push_back(Cur.getOperand(1));
	      // Re-evaluate the number of nodes to be traversed.
	      e += 2; // 2 more nodes (LHS and RHS) are pushed.
	      continue;
	    }

	    // Quit if a non-EXTRACT_VECTOR_ELT
	    if (Cur.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // Quit if without a constant index.
	    SDValue Idx = Cur.getOperand(1);
	    if (!isa<ConstantSDNode>(Idx))
	      return SDValue();

	    SDValue ExtractedFromVec = Cur.getOperand(0);
	    DenseMap<SDValue, uint64_t>::iterator M = VecInMap.find(ExtractedFromVec);
	    if (M == VecInMap.end()) {
	      VT = ExtractedFromVec.getValueType();
	      // Quit if not 128/256-bit vector.
	      if (!VT.is128BitVector() && !VT.is256BitVector())
	        return SDValue();
	      // Quit if the element indices cannot be tracked in a 64-bit mask.
	      if (VT.getVectorNumElements() > 64)
	        return SDValue();
	      // Quit if not the same type.
	      if (VecInMap.begin() != VecInMap.end() &&
	          VT != VecInMap.begin()->first.getValueType())
	        return SDValue();
	      M = VecInMap.insert(std::make_pair(ExtractedFromVec, uint64_t(0))).first;
	      VecIns.push_back(ExtractedFromVec);
	    }
	    // Quit on an out-of-range index; besides being meaningless it would make
	    // the shift below undefined. All tracked vectors share the type VT.
	    uint64_t EltIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
	    if (EltIdx >= VT.getVectorNumElements())
	      return SDValue();
	    M->second |= uint64_t(1) << EltIdx;
	  }

	  assert((VT.is128BitVector() || VT.is256BitVector()) &&
	         "Not extracted from 128-/256-bit vector.");

	  // Low NumElements bits set. The element count is in [1, 64], so the shift
	  // amount is in [0, 63].
	  uint64_t FullMask = ~uint64_t(0) >> (64 - VT.getVectorNumElements());

	  for (DenseMap<SDValue, uint64_t>::const_iterator
	         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
	    // Quit if not all elements are used.
	    if (I->second != FullMask)
	      return SDValue();
	  }

	  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

	  // Cast all vectors into TestVT for PTEST.
	  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
	    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

	  // If more than one full vector is evaluated, OR them first before PTEST.
	  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
	    // Each iteration will OR 2 nodes and append the result until there is only
	    // 1 node left, i.e. the final OR'd value of all vectors.
	    SDValue LHS = VecIns[Slot];
	    SDValue RHS = VecIns[Slot + 1];
	    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
	  }

	  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
	}

	/// \brief return true if \c Op has a use that doesn't just read flags.
	///
	/// A use counts as flag-only when the user is a BRCOND, a SETCC, or a SELECT
	/// that uses the value as operand 0. A TRUNCATE user with exactly one use is
	/// looked through, and its own user is classified instead.
	static bool hasNonFlagsUse(SDValue Op) {
	  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
	       ++UI) {
	    SDNode *User = *UI;
	    unsigned UOpNo = UI.getOperandNo();
	    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
	      // Look past the truncate. The operand number must be read before User
	      // is replaced by the truncate's own user.
	      UOpNo = User->use_begin().getOperandNo();
	      User = *User->use_begin();
	    }

	    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
	        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
	      return true;
	  }
	  return false;
	}

	// Emit KTEST instruction for bit vectors on AVX-512
	//
	// Only applies when Op is a BITCAST whose source is a vector of i1 with a
	// size for which KTEST is available: 8 or 16 bits with DQI, 32 or 64 bits
	// with BWI. Returns an empty SDValue in every other case.
	static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
	                         const X86Subtarget &Subtarget) {
	  if (Op.getOpcode() == ISD::BITCAST) {
	    auto hasKTEST = [&](MVT VT) {
	      unsigned SizeInBits = VT.getSizeInBits();
	      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
	        (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
	    };
	    SDValue Op0 = Op.getOperand(0);
	    MVT Op0VT = Op0.getValueType().getSimpleVT();
	    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
	        hasKTEST(Op0VT))
	      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
	  }
	  return SDValue();
	}

	/// Emit nodes that will be selected as "test Op0,Op0", or something
	/// equivalent.
	SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
	SelectionDAG &DAG) const {
	if (Op.getValueType() == MVT::i1) {
	SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
	DAG.getConstant(0, dl, MVT::i8));
	}
	// CF and OF aren't always set the way we want. Determine which
	// of these we need.
	bool NeedCF = false;
	bool NeedOF = false;
	switch (X86CC) {
	default: break;
	case X86::COND_A: case X86::COND_AE:
	case X86::COND_B: case X86::COND_BE:
	NeedCF = true;
	break;
	case X86::COND_G: case X86::COND_GE:
	case X86::COND_L: case X86::COND_LE:
	case X86::COND_O: case X86::COND_NO: {
	// Check if we really need to set the
	// Overflow flag. If NoSignedWrap is present
	// that is not actually needed.
	switch (Op->getOpcode()) {
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	case ISD::SHL:
	if (Op.getNode()->getFlags().hasNoSignedWrap())
	break;
	LLVM_FALLTHROUGH;
	default:
	NeedOF = true;
	break;
	}
	break;
	}
	}
	// See if we can use the EFLAGS value from the operand instead of
	// doing a separate TEST. TEST always sets OF and CF to 0, so unless
	// we prove that the arithmetic won't overflow, we can't use OF or CF.
	if (Op.getResNo() != 0 \|\| NeedOF \|\| NeedCF) {
	// Emit KTEST for bit vectors
	if (auto Node = EmitKTEST(Op, DAG, Subtarget))
	return Node;
	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	unsigned Opcode = 0;
	unsigned NumOperands = 0;

	// Truncate operations may prevent the merge of the SETCC instruction
	// and the arithmetic instruction before it. Attempt to truncate the operands
	// of the arithmetic instruction and use a reduced bit-width instruction.
	bool NeedTruncation = false;
	SDValue ArithOp = Op;
	if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
	SDValue Arith = Op->getOperand(0);
	// Both the trunc and the arithmetic op need to have one user each.
	if (Arith->hasOneUse())
	switch (Arith.getOpcode()) {
	default: break;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: {
	NeedTruncation = true;
	ArithOp = Arith;
	}
	}
	}

	// Sometimes flags can be set either with an AND or with an SRL/SHL
	// instruction. SRL/SHL variant should be preferred for masks longer than this
	// number of bits.
	const int ShiftToAndMaxMaskWidth = 32;
	const bool ZeroCheck = (X86CC == X86::COND_E \|\| X86CC == X86::COND_NE);

	// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
	// which may be the result of a CAST. We use the variable 'Op', which is the
	// non-casted variable when we check for possible users.
	switch (ArithOp.getOpcode()) {
	case ISD::ADD:
	// We only want to rewrite this as a target-specific node with attached
	// flags if there is a reasonable chance of either using that to do custom
	// instructions selection that can fold some of the memory operands, or if
	// only the flags are used. If there are other uses, leave the node alone
	// and emit a test instruction.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() != ISD::CopyToReg &&
	UI->getOpcode() != ISD::SETCC &&
	UI->getOpcode() != ISD::STORE)
	goto default_case;

	if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
	// An add of one will be selected as an INC.
	if (C->isOne() &&
	(!Subtarget.slowIncDec() \|\|
	DAG.getMachineFunction().getFunction().optForSize())) {
	Opcode = X86ISD::INC;
	NumOperands = 1;
	break;
	}

	// An add of negative one (subtract of one) will be selected as a DEC.
	if (C->isAllOnesValue() &&
	(!Subtarget.slowIncDec() \|\|
	DAG.getMachineFunction().getFunction().optForSize())) {
	Opcode = X86ISD::DEC;
	NumOperands = 1;
	break;
	}
	}

	// Otherwise use a regular EFLAGS-setting add.
	Opcode = X86ISD::ADD;
	NumOperands = 2;
	break;
	case ISD::SHL:
	case ISD::SRL:
	// If we have a constant logical shift that's only used in a comparison
	// against zero turn it into an equivalent AND. This allows turning it into
	// a TEST instruction later.
	if (ZeroCheck && Op->hasOneUse() &&
	isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
	EVT VT = Op.getValueType();
	unsigned BitWidth = VT.getSizeInBits();
	unsigned ShAmt = Op->getConstantOperandVal(1);
	if (ShAmt >= BitWidth) // Avoid undefined shifts.
	break;
	APInt Mask = ArithOp.getOpcode() == ISD::SRL
	? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
	: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
	if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
	break;
	Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
	DAG.getConstant(Mask, dl, VT));
	}
	break;

	case ISD::AND:
	// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
	// because a TEST instruction will be better. However, AND should be
	// preferred if the instruction can be combined into ANDN.
	if (!hasNonFlagsUse(Op)) {
	SDValue Op0 = ArithOp->getOperand(0);
	SDValue Op1 = ArithOp->getOperand(1);
	EVT VT = ArithOp.getValueType();
	bool isAndn = isBitwiseNot(Op0) \|\| isBitwiseNot(Op1);
	bool isLegalAndnType = VT == MVT::i32 \|\| VT == MVT::i64;
	bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

	// If we cannot select an ANDN instruction, check if we can replace
	// AND+IMM64 with a shift before giving up. This is possible for masks
	// like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
	if (!isProperAndn) {
	if (!ZeroCheck)
	break;

	assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
	auto *CN = dyn_cast<ConstantSDNode>(Op1);
	if (!CN)
	break;

	const APInt &Mask = CN->getAPIntValue();
	if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
	break; // Prefer TEST instruction.

	unsigned BitWidth = Mask.getBitWidth();
	unsigned LeadingOnes = Mask.countLeadingOnes();
	unsigned TrailingZeros = Mask.countTrailingZeros();

	if (LeadingOnes + TrailingZeros == BitWidth) {
	assert(TrailingZeros < VT.getSizeInBits() &&
	"Shift amount should be less than the type width");
	MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
	Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
	break;
	}

	unsigned LeadingZeros = Mask.countLeadingZeros();
	unsigned TrailingOnes = Mask.countTrailingOnes();

	if (LeadingZeros + TrailingOnes == BitWidth) {
	assert(LeadingZeros < VT.getSizeInBits() &&
	"Shift amount should be less than the type width");
	MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
	Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
	break;
	}

	break;
	}
	}
	LLVM_FALLTHROUGH;
	case ISD::SUB:
	case ISD::OR:
	case ISD::XOR:
	// Similar to ISD::ADD above, check if the uses will preclude useful
	// lowering of the target-specific node.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() != ISD::CopyToReg &&
	UI->getOpcode() != ISD::SETCC &&
	UI->getOpcode() != ISD::STORE)
	goto default_case;

	// Otherwise use a regular EFLAGS-setting instruction.
	switch (ArithOp.getOpcode()) {
	default: llvm_unreachable("unexpected operator!");
	case ISD::SUB: Opcode = X86ISD::SUB; break;
	case ISD::XOR: Opcode = X86ISD::XOR; break;
	case ISD::AND: Opcode = X86ISD::AND; break;
	case ISD::OR: {
	if (!NeedTruncation && ZeroCheck) {
	if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
	return EFLAGS;
	}
	Opcode = X86ISD::OR;
	break;
	}
	}

	NumOperands = 2;
	break;
	case X86ISD::ADD:
	case X86ISD::SUB:
	case X86ISD::INC:
	case X86ISD::DEC:
	case X86ISD::OR:
	case X86ISD::XOR:
	case X86ISD::AND:
	return SDValue(Op.getNode(), 1);
	default:
	default_case:
	break;
	}

	// If we found that truncation is beneficial, perform the truncation and
	// update 'Op'.
	if (NeedTruncation) {
	EVT VT = Op.getValueType();
	SDValue WideVal = Op->getOperand(0);
	EVT WideVT = WideVal.getValueType();
	unsigned ConvertedOp = 0;
	// Use a target machine opcode to prevent further DAGCombine
	// optimizations that may separate the arithmetic operations
	// from the setcc node.
	switch (WideVal.getOpcode()) {
	default: break;
	case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
	case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
	case ISD::AND: ConvertedOp = X86ISD::AND; break;
	case ISD::OR: ConvertedOp = X86ISD::OR; break;
	case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
	}

	if (ConvertedOp) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
	SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
	SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
	Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
	}
	}
	}

	if (Opcode == 0) {
	// Emit KTEST for bit vectors
	if (auto Node = EmitKTEST(Op, DAG, Subtarget))
	return Node;

	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

	SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
	DAG.ReplaceAllUsesWith(Op, New);
	return SDValue(New.getNode(), 1);
	}

	/// Build the node(s) that instruction selection will turn into
	/// "cmp Op0, Op1" (or an equivalent flag-producing instruction) and return
	/// the value carrying the flags.
	SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) const {
	  // A comparison against zero is delegated to the TEST lowering.
	  if (isNullConstant(Op1))
	    return EmitTest(Op0, X86CC, dl, DAG);

	  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
	         "Unexpected comparison operation for MVT::i1 operands");

	  // Anything that is not i8/i16/i32/i64 is compared with a plain
	  // X86ISD::CMP node.
	  const EVT CmpVT = Op0.getValueType();
	  if (CmpVT != MVT::i8 && CmpVT != MVT::i16 && CmpVT != MVT::i32 &&
	      CmpVT != MVT::i64)
	    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);

	  // 16-bit immediates are to be avoided: an i16 compare that involves a
	  // constant is widened to i32, except when optimizing for minimum size or
	  // when targeting Atom. No other width is promoted.
	  const bool NoConstantOperand =
	      !isa<ConstantSDNode>(Op0) && !isa<ConstantSDNode>(Op1);
	  if (CmpVT == MVT::i16 && !NoConstantOperand &&
	      !DAG.getMachineFunction().getFunction().optForMinSize() &&
	      !Subtarget.isAtom()) {
	    // The extension kind has to agree with the signedness of the condition.
	    unsigned ExtendOp = ISD::SIGN_EXTEND;
	    if (isX86CCUnsigned(X86CC))
	      ExtendOp = ISD::ZERO_EXTEND;
	    Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
	    Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
	  }

	  // Emit SUB rather than CMP so that CSE can share the node with a real
	  // subtraction of the same operands; result #1 is the flags value.
	  SDVTList ResultTys = DAG.getVTList(Op0.getValueType(), MVT::i32);
	  SDValue Difference = DAG.getNode(X86ISD::SUB, dl, ResultTys, Op0, Op1);
	  return Difference.getValue(1);
	}

	/// Rewrite a floating-point X86ISD::CMP so that its result ends up in
	/// EFLAGS when the subtarget needs that; otherwise hand \p Cmp back
	/// untouched.
	SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
	                                                 SelectionDAG &DAG) const {
	  // Only floating-point compares on subtargets without FUCOMI need the
	  // conversion; the code keys that off the CMOV feature.
	  if (Subtarget.hasCMov())
	    return Cmp;

	  // Only X86ISD::CMP nodes whose two operands are both floating point are
	  // affected. The opcode is checked before the operands are inspected.
	  if (Cmp.getOpcode() != X86ISD::CMP)
	    return Cmp;
	  const bool BothOperandsFP =
	      Cmp.getOperand(0).getValueType().isFloatingPoint() &&
	      Cmp.getOperand(1).getValueType().isFloatingPoint();
	  if (!BothOperandsFP)
	    return Cmp;

	  // Instruction selection will pick FUCOM instead of FUCOMI, and FUCOM
	  // leaves its result in FPSW rather than EFLAGS. Build the sequence that
	  // moves it across:
	  //   (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
	  SDLoc dl(Cmp);
	  SDValue Status = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
	  Status = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, Status);
	  Status = DAG.getNode(ISD::SRL, dl, MVT::i16, Status,
	                       DAG.getConstant(8, dl, MVT::i8));
	  SDValue HighByte = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Status);

	  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
	  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
	  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, HighByte);
	}

	/// Report whether FSQRT is cheap enough for \p Op that replacing it with an
	/// RSQRT-based sequence should be disabled.
	bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
	  const EVT VT = Op.getValueType();

	  // If an FRSQRT of this very input already exists in the DAG, never add a
	  // SQRT of the same input next to it.
	  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
	    return false;

	  // Otherwise defer to the subtarget's vector or scalar tuning flag.
	  return VT.isVector() ? Subtarget.hasFastVectorFSQRT()
	                       : Subtarget.hasFastScalarFSQRT();
	}

	/// Produce a hardware reciprocal-square-root estimate of \p Op, or an empty
	/// SDValue when no estimate is emitted for its type.
	///
	/// The minimum architected relative accuracy is 2^-12, so one Newton-Raphson
	/// step is needed for a good float result (24 bits of precision). That is
	/// the default written to \p RefinementSteps when the caller left it
	/// unspecified. \p UseOneConstNR is cleared whenever an estimate is returned.
	SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
	                                           SelectionDAG &DAG, int Enabled,
	                                           int &RefinementSteps,
	                                           bool &UseOneConstNR,
	                                           bool Reciprocal) const {
	  const EVT VT = Op.getValueType();

	  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
	  // TODO: Add support for AVX512 (v16f32).
	  // f64 is deliberately left out: a double-precision rsqrt estimate with
	  // refinement on x86 prior to FMA requires at least 16 instructions
	  // (convert to single, rsqrtss, convert back to double, then 3 refinement
	  // steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
	  // along with FMA, this could be a throughput win.
	  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
	  // after legalize types. Hence the non-reciprocal v4f32 case below asks for
	  // SSE2 instead of SSE1.
	  bool HasEstimate = false;
	  if (VT == MVT::f32)
	    HasEstimate = Subtarget.hasSSE1();
	  else if (VT == MVT::v4f32)
	    HasEstimate = Reciprocal ? Subtarget.hasSSE1() : Subtarget.hasSSE2();
	  else if (VT == MVT::v8f32)
	    HasEstimate = Subtarget.hasAVX();

	  if (!HasEstimate)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;
	  UseOneConstNR = false;
	  return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
	}

	/// Produce a hardware reciprocal estimate of \p Op, or an empty SDValue when
	/// no estimate is emitted for its type.
	///
	/// The minimum architected relative accuracy is 2^-12, so one Newton-Raphson
	/// step is needed for a good float result (24 bits of precision). That is
	/// the default written to \p RefinementSteps when the caller left it
	/// unspecified.
	SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
	                                            int Enabled,
	                                            int &RefinementSteps) const {
	  const EVT VT = Op.getValueType();
	  const bool IsScalarF32 = VT == MVT::f32;

	  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
	  // TODO: Add support for AVX512 (v16f32).
	  // f64 is deliberately left out: a double-precision reciprocal estimate
	  // with refinement on x86 prior to FMA requires 15 instructions (convert to
	  // single, rcpss, convert back to double, then 3 refinement steps = 12
	  // insts). If an 'rcpsd' variant was added to the ISA along with FMA, this
	  // could be a throughput win.
	  bool HasEstimate = false;
	  if (IsScalarF32)
	    HasEstimate = Subtarget.hasSSE1();
	  else if (VT == MVT::v4f32)
	    HasEstimate = Subtarget.hasSSE1();
	  else if (VT == MVT::v8f32)
	    HasEstimate = Subtarget.hasAVX();

	  if (!HasEstimate)
	    return SDValue();

	  // Estimate codegen with 1 refinement step is enabled by default for
	  // vector division only. Scalar division estimates are disabled unless
	  // explicitly requested because they break too much real-world code.
	  // These defaults are intended to match GCC behavior.
	  if (IsScalarF32 && Enabled == ReciprocalEstimate::Unspecified)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;

	  return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
	}

	/// Minimum number of divisions sharing one divisor before they are rewritten
	/// as multiplications by a reciprocal.
	///
	/// The rewrite still needs one division to compute the reciprocal, followed
	/// by one multiply per original division. With the threshold at two it only
	/// pays off if a division costs at least twice as much as a multiplication,
	/// so this value may need to be adjusted for a given CPU.
	unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
	  const unsigned MinDivisionsSharingDivisor = 2;
	  return MinDivisionsSharingDivisor;
	}

	/// Create an i8 X86ISD::SETCC node that evaluates condition \p Cond on the
	/// flags value \p EFLAGS.
	static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
	                        SelectionDAG &DAG) {
	  // The condition code travels as an i8 constant operand.
	  SDValue CondOperand = DAG.getConstant(Cond, dl, MVT::i8);
	  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CondOperand, EFLAGS);
	}

	/// Create a BT (Bit Test) node that tests bit \p BitNo of \p Src, wrapped in
	/// a SETCC whose condition matches the equal/not-equal condition code \p CC.
	static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) {
	  // There is no i8 BT instruction, and the i16 form has a larger encoding
	  // than the i32 one, so both narrow widths are any-extended to i32 (for
	  // i16 this is a performance / code size choice). Because the shift amount
	  // is in-range-or-undefined, testing the bit in the i32 value is ok.
	  const EVT OrigVT = Src.getValueType();
	  const bool WideEnough = OrigVT != MVT::i8 && OrigVT != MVT::i16;
	  if (!WideEnough)
	    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

	  // Prefer the 32-bit instruction over the 64-bit one for its shorter
	  // encoding. The former takes BitNo modulo 32 and the latter modulo 64, so
	  // this is only valid when the bit of BitNo with value 32 is known zero.
	  if (Src.getValueType() == MVT::i64) {
	    const APInt Bit32(BitNo.getValueSizeInBits(), 32);
	    if (DAG.MaskedValueIsZero(BitNo, Bit32))
	      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
	  }

	  // Bring the bit index to the type of the source. BT ignores high bits
	  // (like shifts), so an any-extend is sufficient.
	  const EVT SrcVT = Src.getValueType();
	  if (SrcVT != BitNo.getValueType())
	    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, SrcVT, BitNo);

	  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);

	  // SETEQ is mapped to COND_AE, every other condition code to COND_B.
	  if (CC == ISD::SETEQ)
	    return getSETCC(X86::COND_AE, BT, dl, DAG);
	  return getSETCC(X86::COND_B, BT, dl, DAG);
	}

	/// The result of \p And is being compared against zero with condition \p CC.
	/// Turn the pair into a BT-based condition when one of the recognised
	/// shapes matches; return an empty SDValue otherwise.
	static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
	                            const SDLoc &dl, SelectionDAG &DAG) {
	  assert(And.getOpcode() == ISD::AND && "Expected AND node!");

	  // Look through one TRUNCATE on either operand.
	  auto StripTruncate = [](SDValue V) {
	    return V.getOpcode() == ISD::TRUNCATE ? V.getOperand(0) : V;
	  };
	  SDValue First = StripTruncate(And.getOperand(0));
	  SDValue Second = StripTruncate(And.getOperand(1));

	  // Canonicalize a shift-left onto the first operand.
	  if (Second.getOpcode() == ISD::SHL)
	    std::swap(First, Second);

	  // Shape 1: (and (shl 1, N), X)  -->  bit test of X at index N.
	  if (First.getOpcode() == ISD::SHL) {
	    if (!isOneConstant(First.getOperand(0)))
	      return SDValue();

	    // If we looked past a truncate, check that it's only truncating away
	    // known zeros.
	    const unsigned ShlBits = First.getValueSizeInBits();
	    const unsigned AndBits = And.getValueSizeInBits();
	    if (ShlBits > AndBits) {
	      KnownBits Known;
	      DAG.computeKnownBits(First, Known);
	      if (Known.countMinLeadingZeros() < ShlBits - AndBits)
	        return SDValue();
	    }
	    return getBitTestCondition(Second, First.getOperand(1), CC, dl, DAG);
	  }

	  // The remaining shapes need a constant mask as the second operand.
	  if (Second.getOpcode() != ISD::Constant)
	    return SDValue();
	  const uint64_t MaskVal = cast<ConstantSDNode>(Second)->getZExtValue();

	  // Shape 2: a single-bit mask that does not fit in 32 bits can't be
	  // encoded as a TEST immediate, so use BT with the bit index instead.
	  // (This cannot overlap with shape 3, whose mask is 1.)
	  if (!isUInt<32>(MaskVal) && isPowerOf2_64(MaskVal)) {
	    SDValue BitIndex =
	        DAG.getConstant(Log2_64_Ceil(MaskVal), dl, First.getValueType());
	    return getBitTestCondition(First, BitIndex, CC, dl, DAG);
	  }

	  // Shape 3: (and (srl X, N), 1)  -->  bit test of X at index N.
	  if (MaskVal == 1 && First.getOpcode() == ISD::SRL)
	    return getBitTestCondition(First.getOperand(0), First.getOperand(1), CC,
	                               dl, DAG);

	  return SDValue();
	}

	/// Map an ISD::CondCode onto the immediate used by SSE floating-point mask
	/// compares. Predicates that are only available with the operands reversed
	/// swap \p Op0 and \p Op1 in place.
	static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
	                                   SDValue &Op1) {
	  // SSE condition code mapping:
	  //  0 - EQ      4 - NEQ
	  //  1 - LT      5 - NLT
	  //  2 - LE      6 - NLE
	  //  3 - UNORD   7 - ORD
	  // SETUEQ and SETONE are not part of that table; they are returned as 8
	  // and 12 respectively.
	  unsigned Predicate = 0;
	  bool SwapOperands = false;

	  switch (SetCCOpcode) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETOEQ:
	  case ISD::SETEQ:
	    Predicate = 0;
	    break;
	  case ISD::SETOGT:
	  case ISD::SETGT:
	    // a > b  is lowered as  b < a.
	    SwapOperands = true;
	    Predicate = 1;
	    break;
	  case ISD::SETLT:
	  case ISD::SETOLT:
	    Predicate = 1;
	    break;
	  case ISD::SETOGE:
	  case ISD::SETGE:
	    // a >= b  is lowered as  b <= a.
	    SwapOperands = true;
	    Predicate = 2;
	    break;
	  case ISD::SETLE:
	  case ISD::SETOLE:
	    Predicate = 2;
	    break;
	  case ISD::SETUO:
	    Predicate = 3;
	    break;
	  case ISD::SETUNE:
	  case ISD::SETNE:
	    Predicate = 4;
	    break;
	  case ISD::SETULE:
	    // Same predicate as SETUGE, with the operands reversed.
	    SwapOperands = true;
	    Predicate = 5;
	    break;
	  case ISD::SETUGE:
	    Predicate = 5;
	    break;
	  case ISD::SETULT:
	    // Same predicate as SETUGT, with the operands reversed.
	    SwapOperands = true;
	    Predicate = 6;
	    break;
	  case ISD::SETUGT:
	    Predicate = 6;
	    break;
	  case ISD::SETO:
	    Predicate = 7;
	    break;
	  case ISD::SETUEQ:
	    Predicate = 8;
	    break;
	  case ISD::SETONE:
	    Predicate = 12;
	    break;
	  }

	  if (SwapOperands)
	    std::swap(Op0, Op1);

	  return Predicate;
	}

	/// Split a 256-bit integer vector SETCC into two 128-bit compares and
	/// concatenate the two results back into a 256-bit value.
	static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();

	  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned HalfNumElems = VT.getVectorNumElements() / 2;
	  SDValue CC = Op.getOperand(2);

	  // Low and high 128-bit halves of the left-hand operand...
	  SDValue LHS = Op.getOperand(0);
	  SDValue LHSLo = extract128BitVector(LHS, 0, DAG, dl);
	  SDValue LHSHi = extract128BitVector(LHS, HalfNumElems, DAG, dl);

	  // ...and of the right-hand operand.
	  SDValue RHS = Op.getOperand(1);
	  SDValue RHSLo = extract128BitVector(RHS, 0, DAG, dl);
	  SDValue RHSHi = extract128BitVector(RHS, HalfNumElems, DAG, dl);

	  // Compare each half with the original opcode and condition on a vector
	  // type of the same element type and half the element count, then glue
	  // the halves back together.
	  const MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);
	  const unsigned Opc = Op.getOpcode();
	  SDValue Lo = DAG.getNode(Opc, dl, HalfVT, LHSLo, RHSLo, CC);
	  SDValue Hi = DAG.getNode(Opc, dl, HalfVT, LHSHi, RHSHi, CC);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	}

	/// Lower a SETCC whose operands are vectors of i1 into plain bitwise logic
	/// (XOR / AND / OR, with XOR against all-ones acting as NOT).
	static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const MVT VT = Op.getSimpleValueType();
	  SDValue X = Op.getOperand(0);
	  SDValue Y = Op.getOperand(1);

	  assert(X.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Unexpected type for boolean compare operation");
	  const ISD::CondCode Pred = cast<CondCodeSDNode>(Op.getOperand(2))->get();

	  // Logical NOT expressed as XOR with an all-ones constant.
	  auto BuildNot = [&](SDValue V) {
	    return DAG.getNode(ISD::XOR, dl, VT, V, DAG.getConstant(-1, dl, VT));
	  };

	  // Both negations are created up front, whether or not the selected
	  // predicate ends up using them.
	  SDValue NotX = BuildNot(X);
	  SDValue NotY = BuildNot(Y);

	  switch (Pred) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETEQ:
	    // (x == y) -> ~(x ^ y)
	    return BuildNot(DAG.getNode(ISD::XOR, dl, VT, X, Y));
	  case ISD::SETNE:
	    // (x != y) -> (x ^ y)
	    return DAG.getNode(ISD::XOR, dl, VT, X, Y);
	  case ISD::SETUGT:
	  case ISD::SETGT:
	    // (x > y) -> (x & ~y)
	    return DAG.getNode(ISD::AND, dl, VT, X, NotY);
	  case ISD::SETULT:
	  case ISD::SETLT:
	    // (x < y) -> (~x & y)
	    return DAG.getNode(ISD::AND, dl, VT, NotX, Y);
	  case ISD::SETULE:
	  case ISD::SETLE:
	    // (x <= y) -> (~x or y)
	    return DAG.getNode(ISD::OR, dl, VT, NotX, Y);
	  case ISD::SETUGE:
	  case ISD::SETGE:
	    // (x >= y) -> (x or ~y)
	    return DAG.getNode(ISD::OR, dl, VT, X, NotY);
	  }
	}

	/// Lower an integer vector SETCC that produces a vector of i1 into one of
	/// the mask-compare nodes: PCMPEQM / PCMPGTM when the predicate has a
	/// dedicated node, TESTM / TESTNM for an (in)equality of an AND against
	/// zero, and CMPM / CMPMU with a predicate immediate otherwise.
	static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const MVT VT = Op.getSimpleValueType();
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);

	  assert(VT.getVectorElementType() == MVT::i1 &&
	         "Cannot set masked compare for this operation");

	  // How the predicate is going to be encoded:
	  //   DirectOpc    - dedicated node (PCMPEQM / PCMPGTM), 0 if there is none
	  //   Imm          - predicate immediate for CMPM / CMPMU when DirectOpc is 0
	  //   IsUnsigned   - pick CMPMU instead of CMPM
	  //   SwapOperands - compare (RHS, LHS) instead of (LHS, RHS)
	  //   IsEquality   - predicate is SETEQ or SETNE
	  unsigned DirectOpc = 0;
	  unsigned Imm = 0;
	  bool IsUnsigned = false;
	  bool SwapOperands = false;
	  bool IsEquality = false;

	  switch (cast<CondCodeSDNode>(Op.getOperand(2))->get()) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETEQ:
	    DirectOpc = X86ISD::PCMPEQM;
	    IsEquality = true;
	    break;
	  case ISD::SETNE:
	    Imm = 4;
	    IsEquality = true;
	    break;
	  case ISD::SETGT:
	    DirectOpc = X86ISD::PCMPGTM;
	    break;
	  case ISD::SETLT:
	    // a < b  is  b > a.
	    DirectOpc = X86ISD::PCMPGTM;
	    SwapOperands = true;
	    break;
	  case ISD::SETLE:
	    Imm = 2;
	    break;
	  case ISD::SETGE:
	    // a >= b  is  b <= a (LE + swap).
	    Imm = 2;
	    SwapOperands = true;
	    break;
	  case ISD::SETULT:
	    Imm = 1;
	    IsUnsigned = true;
	    break;
	  case ISD::SETULE:
	    Imm = 2;
	    IsUnsigned = true;
	    break;
	  case ISD::SETUGT:
	    Imm = 6;
	    IsUnsigned = true;
	    break;
	  case ISD::SETUGE:
	    Imm = 5; // NLT
	    IsUnsigned = true;
	    break;
	  }

	  if (SwapOperands)
	    std::swap(LHS, RHS);

	  // CMP(EQ or NEQ, AND(A, B), ZERO) becomes TESTNM or TESTM on (A, B).
	  if (IsEquality) {
	    SDValue Inner = peekThroughBitcasts(LHS);
	    const unsigned InnerOpc = Inner.getOpcode();
	    const bool NotAnAnd = InnerOpc != ISD::AND && InnerOpc != X86ISD::FAND;
	    if (!NotAnAnd && ISD::isBuildVectorAllZeros(RHS.getNode())) {
	      // Bitcast the AND operands back to the type being compared.
	      const MVT OperandVT = LHS.getSimpleValueType();
	      SDValue A = DAG.getBitcast(OperandVT, Inner.getOperand(0));
	      SDValue B = DAG.getBitcast(OperandVT, Inner.getOperand(1));
	      const unsigned TestOpc =
	          DirectOpc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM;
	      return DAG.getNode(TestOpc, dl, VT, A, B);
	    }
	  }

	  if (DirectOpc)
	    return DAG.getNode(DirectOpc, dl, VT, LHS, RHS);

	  const unsigned CmpOpc = IsUnsigned ? X86ISD::CMPMU : X86ISD::CMPM;
	  return DAG.getNode(CmpOpc, dl, VT, LHS, RHS,
	                     DAG.getConstant(Imm, dl, MVT::i8));
	}

	/// \brief Try to turn a VSETULT into a VSETULE by rewriting its second
	/// operand \p Op1: every element c of a constant build-vector is replaced by
	/// c - 1. Returns an empty value whenever that is not trivially possible
	/// (for example because \p Op1 is not constant, or an element is zero).
	static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
	                                      SelectionDAG &DAG) {
	  auto *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
	  if (!BV)
	    return SDValue();

	  const MVT VecVT = Op1.getSimpleValueType();
	  const MVT EltVT = VecVT.getVectorElementType();
	  SmallVector<SDValue, 8> Decremented;

	  for (unsigned Idx = 0, NumElts = VecVT.getVectorNumElements();
	       Idx != NumElts; ++Idx) {
	    // Every element has to be a plain, non-opaque constant of exactly the
	    // vector's element type.
	    auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(Idx));
	    if (!C)
	      return SDValue();
	    if (C->isOpaque())
	      return SDValue();
	    if (C->getSimpleValueType(0) != EltVT)
	      return SDValue();

	    // Zero has no unsigned predecessor; subtracting one would wrap around.
	    const APInt &Val = C->getAPIntValue();
	    if (Val == 0)
	      return SDValue();

	    Decremented.push_back(DAG.getConstant(Val - 1, dl, EltVT));
	  }

	  return DAG.getBuildVector(VecVT, dl, Decremented);
	}

	/// Lower a vector ISD::SETCC node.
	///
	/// Floating-point compares become X86ISD::CMPM (AVX-512 with an i1-element
	/// result) or X86ISD::CMPP. Integer compares are, in order of preference,
	/// re-issued on the operand type, split into 128-bit halves, handed to the
	/// AVX-512 mask helpers, lowered to XOP VPCOM/VPCOMU, or synthesized from
	/// PCMPEQ/PCMPGT using operand swaps, sign-bit flips, result inversion,
	/// UMIN/UMAX or SUBUS.
	///
	/// \param Op        the SETCC node; operands are (LHS, RHS, condition code).
	/// \param Subtarget queried for the available instruction-set features.
	/// \param DAG       the DAG in which the replacement nodes are created.
	/// \returns the value that replaces \p Op.
	static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue CC = Op.getOperand(2);
	MVT VT = Op.getSimpleValueType();
	ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
	bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
	SDLoc dl(Op);

	// Floating-point compares are handled entirely inside this branch.
	if (isFP) {
	#ifndef NDEBUG
	MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
	assert(EltVT == MVT::f32 \|\| EltVT == MVT::f64);
	#endif

	unsigned Opc;
	if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
	assert(VT.getVectorNumElements() <= 16);
	Opc = X86ISD::CMPM;
	} else {
	Opc = X86ISD::CMPP;
	// The SSE/AVX packed FP comparison nodes are defined with a
	// floating-point vector result that matches the operand type. This allows
	// them to work with an SSE1 target (integer vector types are not legal).
	VT = Op0.getSimpleValueType();
	}

	// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
	// emit two comparisons and a logic op to tie them together.
	SDValue Cmp;
	// Note: translateX86FSETCC may swap Op0 and Op1 in place.
	unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
	if (SSECC >= 8 && !Subtarget.hasAVX()) {
	// LLVM predicate is SETUEQ or SETONE.
	unsigned CC0, CC1;
	unsigned CombineOpc;
	if (Cond == ISD::SETUEQ) {
	CC0 = 3; // UNORD
	CC1 = 0; // EQ
	CombineOpc = X86ISD::FOR;
	} else {
	assert(Cond == ISD::SETONE);
	CC0 = 7; // ORD
	CC1 = 4; // NEQ
	CombineOpc = X86ISD::FAND;
	}

	SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC0, dl, MVT::i8));
	SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC1, dl, MVT::i8));
	Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
	} else {
	// Handle all other FP comparisons here.
	Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(SSECC, dl, MVT::i8));
	}

	// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
	// result type of SETCC. The bitcast is expected to be optimized away
	// during combining/isel.
	if (Opc == X86ISD::CMPP)
	Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

	return Cmp;
	}

	// From here on only integer compares remain.
	MVT VTOp0 = Op0.getSimpleValueType();
	assert(VTOp0 == Op1.getSimpleValueType() &&
	"Expected operands with same type!");
	assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
	"Invalid number of packed elements for source and destination!");

	if (VT.is128BitVector() && VTOp0.is256BitVector()) {
	// On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
	// legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
	// legalizer firstly checks if the first operand in input to the setcc has
	// a legal type. If so, then it promotes the return type to that same type.
	// Otherwise, the return type is promoted to the 'next legal type' which,
	// for a vector of MVT::i1 is always a 128-bit integer vector type.
	//
	// We reach this code only if the following two conditions are met:
	// 1. Both return type and operand type have been promoted to wider types
	// by the type legalizer.
	// 2. The original operand type has been promoted to a 256-bit vector.
	//
	// Note that condition 2. only applies for AVX targets.
	SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
	return DAG.getZExtOrTrunc(NewOp, dl, VT);
	}

	// The non-AVX512 code below works under the assumption that source and
	// destination types are the same.
	assert((Subtarget.hasAVX512() \|\| (VT == VTOp0)) &&
	"Value types for source and destination must be the same!");

	// Break 256-bit integer vector compare into smaller ones.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return Lower256IntVSETCC(Op, DAG);

	// Operands are boolean (vectors of i1)
	MVT OpVT = Op1.getSimpleValueType();
	if (OpVT.getVectorElementType() == MVT::i1)
	return LowerBoolVSETCC_AVX512(Op, DAG);

	// The result is boolean, but operands are int/float
	if (VT.getVectorElementType() == MVT::i1) {
	// In AVX-512 architecture setcc returns mask with i1 elements,
	// But there is no compare instruction for i8 and i16 elements in KNL.
	// In this case use SSE compare
	bool UseAVX512Inst =
	(OpVT.is512BitVector() \|\|
	OpVT.getScalarSizeInBits() >= 32 \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX()));

	if (UseAVX512Inst)
	return LowerIntVSETCC_AVX512(Op, DAG);

	// Otherwise compare in the operand type and truncate the result to the
	// i1-element result type.
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
	}

	// Lower using XOP integer comparisons.
	if ((VT == MVT::v16i8 \|\| VT == MVT::v8i16 \|\|
	VT == MVT::v4i32 \|\| VT == MVT::v2i64) && Subtarget.hasXOP()) {
	// Translate compare code to XOP PCOM compare mode.
	unsigned CmpMode = 0;
	switch (Cond) {
	default: llvm_unreachable("Unexpected SETCC condition");
	case ISD::SETULT:
	case ISD::SETLT: CmpMode = 0x00; break;
	case ISD::SETULE:
	case ISD::SETLE: CmpMode = 0x01; break;
	case ISD::SETUGT:
	case ISD::SETGT: CmpMode = 0x02; break;
	case ISD::SETUGE:
	case ISD::SETGE: CmpMode = 0x03; break;
	case ISD::SETEQ: CmpMode = 0x04; break;
	case ISD::SETNE: CmpMode = 0x05; break;
	}

	// Are we comparing unsigned or signed integers?
	unsigned Opc =
	ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

	return DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CmpMode, dl, MVT::i8));
	}

	// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
	// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
	if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
	SDValue BC0 = peekThroughBitcasts(Op0);
	if (BC0.getOpcode() == ISD::AND) {
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(BC0.getOperand(1),
	VT.getScalarSizeInBits(), UndefElts,
	EltBits, false, false)) {
	if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
	Cond = ISD::SETEQ;
	Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
	}
	}
	}
	}

	// We are handling one of the integer comparisons here. Since SSE only has
	// GT and EQ comparisons for integer, swapping operands and multiple
	// operations may be required for some comparisons.
	unsigned Opc = (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) ? X86ISD::PCMPEQ
	: X86ISD::PCMPGT;
	bool Swap = Cond == ISD::SETLT \|\| Cond == ISD::SETULT \|\|
	Cond == ISD::SETGE \|\| Cond == ISD::SETUGE;
	bool Invert = Cond == ISD::SETNE \|\|
	(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

	// If both operands are known non-negative, then an unsigned compare is the
	// same as a signed compare and there's no need to flip signbits.
	// TODO: We could check for more general simplifications here since we're
	// computing known bits.
	bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
	!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

	// Special case: Use min/max operations for SETULE/SETUGE
	MVT VET = VT.getVectorElementType();
	bool HasMinMax =
	(Subtarget.hasAVX512() && VET == MVT::i64) \|\|
	(Subtarget.hasSSE41() && (VET == MVT::i16 \|\| VET == MVT::i32)) \|\|
	(Subtarget.hasSSE2() && (VET == MVT::i8));
	bool MinMax = false;
	if (HasMinMax) {
	switch (Cond) {
	default: break;
	case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
	case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
	}

	// The min/max form needs none of the generic fix-ups.
	if (MinMax)
	Swap = Invert = FlipSigns = false;
	}

	bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 \|\| VET == MVT::i16);
	bool Subus = false;
	if (!MinMax && HasSubus) {
	// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
	// Op0 u<= Op1:
	// t = psubus Op0, Op1
	// pcmpeq t, <0..0>
	switch (Cond) {
	default: break;
	case ISD::SETULT: {
	// If the comparison is against a constant we can turn this into a
	// setule. With psubus, setule does not require a swap. This is
	// beneficial because the constant in the register is no longer
	// destructed as the destination so it can be hoisted out of a loop.
	// Only do this pre-AVX since vpcmp* is no longer destructive.
	if (Subtarget.hasAVX())
	break;
	if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
	Op1 = ULEOp1;
	Subus = true; Invert = false; Swap = false;
	}
	break;
	}
	// Psubus is better than flip-sign because it requires no inversion.
	case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
	case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
	}

	if (Subus) {
	Opc = X86ISD::SUBUS;
	FlipSigns = false;
	}
	}

	if (Swap)
	std::swap(Op0, Op1);

	// Check that the operation in question is available (most are plain SSE2,
	// but PCMPGTQ and PCMPEQQ have different requirements).
	if (VT == MVT::v2i64) {
	if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
	assert(Subtarget.hasSSE2() && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations. The lower
	// compare is always unsigned.
	SDValue SB;
	if (FlipSigns) {
	// Unsigned 64-bit compare: flip the sign bit of all four 32-bit elements.
	SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
	} else {
	// Signed 64-bit compare: flip only elements 0 and 2, i.e. the low parts
	// of the two 64-bit integers (see MaskLo below).
	SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
	SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
	SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
	}
	Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
	Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

	// Emulate PCMPGTQ with (hi1 > hi2) \| ((hi1 == hi2) & (lo1 > lo2))
	SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
	SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

	// Create masks for only the low parts/high parts of the 64 bit integers.
	static const int MaskHi[] = { 1, 1, 3, 3 };
	static const int MaskLo[] = { 0, 0, 2, 2 };
	SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
	SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
	SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

	SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
	Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}

	if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
	// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
	// pcmpeqd + pshufd + pand.
	assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Do the compare.
	SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

	// Make sure the lower and upper halves are both all-ones.
	static const int Mask[] = { 1, 0, 3, 2 };
	SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
	Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}
	}

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations.
	if (FlipSigns) {
	MVT EltVT = VT.getVectorElementType();
	SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
	VT);
	Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
	Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
	}

	// Opc is PCMPEQ, PCMPGT, UMIN, UMAX or SUBUS at this point.
	SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

	// If the logical-not of the result is required, perform that now.
	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	// Min/max form: Op0 u<= Op1 iff umin(Op0, Op1) == Op0, and Op0 u>= Op1 iff
	// umax(Op0, Op1) == Op0, so compare Op0 against the min/max result.
	if (MinMax)
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

	// SUBUS form: the saturating difference is compared against zero.
	if (Subus)
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
	getZeroVector(VT, Subtarget, DAG, dl));

	return Result;
	}

	/// Lower a SETCC node. Vector results are handed to LowerVSETCC; a scalar
	/// result must be i8. The function first tries to form a BT from an AND
	/// compared against zero, then tries to reuse (or invert) an existing
	/// X86ISD::SETCC compared against 0/1, and otherwise emits a compare followed
	/// by an X86 SETCC. Returns an empty SDValue when the condition code cannot
	/// be translated (TranslateX86CC yields COND_INVALID).
	SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
	  const MVT ResultVT = Op.getSimpleValueType();
	  if (ResultVT.isVector())
	    return LowerVSETCC(Op, Subtarget, DAG);
	  assert(ResultVT == MVT::i8 && "SetCC type must be 8-bit integer");

	  SDLoc dl(Op);
	  // LHS/RHS stay mutable lvalues: they are handed to TranslateX86CC below and
	  // the (possibly updated) values are what gets compared afterwards.
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  const bool IsEqOrNe = (CC == ISD::SETEQ) || (CC == ISD::SETNE);

	  // Bit-test forms:
	  //   (X & (1 << N)) == 0   -> BT(X, N)
	  //   ((X >>u N) & 1) != 0  -> BT(X, N)
	  //   ((X >>s N) & 1) != 0  -> BT(X, N)
	  if (IsEqOrNe && LHS.getOpcode() == ISD::AND && LHS.hasOneUse() &&
	      isNullConstant(RHS)) {
	    SDValue BitTest = LowerAndToBT(LHS, CC, dl, DAG);
	    if (BitTest)
	      return BitTest;
	  }

	  // X == 0, X == 1, X != 0 or X != 1 where X is itself an X86 SETCC: either
	  // hand back X unchanged or rebuild it with the opposite condition.
	  if (IsEqOrNe && (isOneConstant(RHS) || isNullConstant(RHS)) &&
	      LHS.getOpcode() == X86ISD::SETCC) {
	    X86::CondCode InnerCC = (X86::CondCode)LHS.getConstantOperandVal(0);
	    // "X != 0" and "X == 1" keep X's meaning; the other two forms negate it.
	    const bool SameMeaning = (CC == ISD::SETNE) == isNullConstant(RHS);
	    if (SameMeaning)
	      return LHS;
	    return getSETCC(X86::GetOppositeBranchCondition(InnerCC),
	                    LHS.getOperand(1), dl, DAG);
	  }

	  // Generic path: translate the condition, emit the compare, read the flags.
	  const bool IsFP = RHS.getSimpleValueType().isFloatingPoint();
	  X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, LHS, RHS, DAG);
	  if (X86CC == X86::COND_INVALID)
	    return SDValue();

	  SDValue Flags = EmitCmp(LHS, RHS, X86CC, dl, DAG);
	  Flags = ConvertCmpIfNecessary(Flags, DAG);
	  return getSETCC(X86CC, Flags, dl, DAG);
	}

	/// Lower a SETCCCARRY node (operands: LHS, RHS, incoming carry, condition
	/// code). The incoming carry value is turned back into a flags value, an SBB
	/// of LHS and RHS consumes it, and the result is an X86 SETCC on the SBB's
	/// flags output.
	SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue CarryIn = Op.getOperand(2);
	  SDValue CondOp = Op.getOperand(3);

	  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
	  X86::CondCode X86CC =
	      TranslateIntegerX86CC(cast<CondCodeSDNode>(CondOp)->get());

	  // Recreate the carry flag: add all-ones to the carry value and use the
	  // flags result (result #1) of that X86ISD::ADD.
	  EVT CarryVT = CarryIn.getValueType();
	  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
	  SDValue CarryAdd =
	      DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn,
	                  DAG.getConstant(AllOnes, DL, CarryVT));
	  SDValue CarryFlag = CarryAdd.getValue(1);

	  // Subtract with borrow and test the resulting flags.
	  SDVTList SbbVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
	  SDValue Sbb = DAG.getNode(X86ISD::SBB, DL, SbbVTs, LHS, RHS, CarryFlag);
	  return getSETCC(X86CC, Sbb.getValue(1), DL, DAG);
	}

	/// Return true if \p Op is the flags-producing result of an X86 node that can
	/// be used as a logical comparison: any result of CMP/COMI/UCOMI/SAHF,
	/// result #1 of the two-result arithmetic/logic nodes listed below, or
	/// result #2 of UMUL.
	static bool isX86LogicalCmp(SDValue Op) {
	  switch (Op.getOpcode()) {
	  // Pure compare-style nodes.
	  case X86ISD::CMP:
	  case X86ISD::COMI:
	  case X86ISD::UCOMI:
	  case X86ISD::SAHF:
	    return true;

	  // Arithmetic/logic nodes: only their second result (index 1) qualifies.
	  case X86ISD::ADD:
	  case X86ISD::SUB:
	  case X86ISD::ADC:
	  case X86ISD::SBB:
	  case X86ISD::SMUL:
	  case X86ISD::INC:
	  case X86ISD::DEC:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	  case X86ISD::AND:
	    return Op.getResNo() == 1;

	  // UMUL: only its third result (index 2) qualifies.
	  case X86ISD::UMUL:
	    return Op.getResNo() == 2;

	  default:
	    return false;
	  }
	}

	/// Return true if \p V is an ISD::TRUNCATE whose source operand is known to
	/// be zero in every bit position that the truncation discards.
	static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
	  if (V.getOpcode() == ISD::TRUNCATE) {
	    SDValue Src = V.getOperand(0);
	    const unsigned SrcBits = Src.getValueSizeInBits();
	    const unsigned DstBits = V.getValueSizeInBits();
	    // Mask covering exactly the high bits dropped by the truncate.
	    const APInt DroppedBits = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
	    return DAG.MaskedValueIsZero(Src, DroppedBits);
	  }
	  return false;
	}

	/// Custom lowering for a select node. Operand 0 of \p Op is the condition,
	/// operands 1 and 2 are the two candidate values. The function tries, in
	/// order:
	///  - scalar f32/f64 selects on a one-use SETCC: AVX512 FSETCCM + SELECTS,
	///    or FSETCC followed by either a vector select (AVX) or FAND/FANDN/FOR;
	///  - remaining scalar FP selects on AVX512: SELECTS on a v1i1 condition;
	///  - i1-element vector selects (split, bitcast-to-scalar, or widen to v8i1);
	///  - integer patterns on "x == 0" / "x != 0" that avoid a CMOV;
	///  - the generic path, which finds or creates a flags-producing node plus a
	///    condition code and emits X86ISD::CMOV (or SETCC_CARRY for 0/-1 selects).
	SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	// AddTest stays true until some flags-producing node has been found for the
	// condition; if it is still true near the end, a test against zero is emitted.
	bool AddTest = true;
	SDValue Cond = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue Op2 = Op.getOperand(2);
	SDLoc DL(Op);
	MVT VT = Op1.getSimpleValueType();
	// Condition-code operand for the final CMOV; filled in further down.
	SDValue CC;

	// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
	// are available or VBLENDV if AVX is available.
	// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
	if (Cond.getOpcode() == ISD::SETCC &&
	((Subtarget.hasSSE2() && (VT == MVT::f32 \|\| VT == MVT::f64)) \|\|
	(Subtarget.hasSSE1() && VT == MVT::f32)) &&
	VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
	SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
	unsigned SSECC = translateX86FSETCC(
	cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

	// With AVX512 the compare produces a v1i1 mask that SELECTS consumes.
	if (Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
	CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
	assert(!VT.isVector() && "Not a scalar type?");
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	// Condition encodings of 8 and above are only used here when AVX is present.
	if (SSECC < 8 \|\| Subtarget.hasAVX()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
	DAG.getConstant(SSECC, DL, MVT::i8));

	// If we have AVX, we can use a variable vector select (VBLENDV) instead
	// of 3 logic instructions for size savings and potentially speed.
	// Unfortunately, there is no scalar form of VBLENDV.

	// If either operand is a constant, don't try this. We can expect to
	// optimize away at least one of the logic instructions later in that
	// case, so that sequence would be faster than a variable blend.

	// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
	// uses XMM0 as the selection register. That may need just as many
	// instructions as the AND/ANDN/OR sequence due to register moves, so
	// don't bother.

	if (Subtarget.hasAVX() &&
	!isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

	// Convert to vectors, do a VSELECT, and convert back to scalar.
	// All of the conversions should be optimized away.

	MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
	SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
	SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
	SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

	// The select condition is the compare result reinterpreted as an integer
	// vector with the same element width.
	MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
	VCmp = DAG.getBitcast(VCmpVT, VCmp);

	SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	VSel, DAG.getIntPtrConstant(0, DL));
	}
	// Bitwise blend: (~Cmp & Op2) | (Cmp & Op1).
	SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
	SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
	return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
	}
	}

	// AVX512 fallback is to lower selects of scalar floats to masked moves.
	if ((VT == MVT::f64 \|\| VT == MVT::f32) && Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	// For v64i1 without 64-bit support we need to split and rejoin.
	if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	assert(Subtarget.hasBWI() && "Expected BWI to be legal");
	SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
	SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
	SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
	SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
	SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
	SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	// i1-element vectors: if both values are available in scalar form (a
	// constant build_vector converted to an integer, or the source of a
	// bitcast), select between the scalars and convert the result back.
	if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
	SDValue Op1Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
	Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
	else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
	Op1Scalar = Op1.getOperand(0);
	SDValue Op2Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
	Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
	else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
	Op2Scalar = Op2.getOperand(0);
	if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
	SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
	Op1Scalar, Op2Scalar);
	if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
	return DAG.getBitcast(VT, newSelect);
	// Sizes differ: view the scalar as v8i1 and take the low VT-sized part.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
	DAG.getIntPtrConstant(0, DL));
	}
	}

	// v4i1/v2i1: widen both values into v8i1 (upper elements undef), select
	// there, and extract the low subvector again.
	if (VT == MVT::v4i1 \|\| VT == MVT::v2i1) {
	SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
	Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
	Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
	SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
	}

	// Lower a generic SETCC condition first so the patterns below can match
	// the X86-specific form.
	if (Cond.getOpcode() == ISD::SETCC) {
	if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
	Cond = NewCond;
	// If the condition was updated, it's possible that the operands of the
	// select were also updated (for example, EmitTest has a RAUW). Refresh
	// the local references to the select operands in case they got stale.
	Op1 = Op.getOperand(1);
	Op2 = Op.getOperand(2);
	}
	}

	// (select (x == 0), -1, y) -> (sign_bit (x - 1)) \| y
	// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) \| y
	// (select (x != 0), y, -1) -> (sign_bit (x - 1)) \| y
	// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) \| y
	// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
	// (select (and (x , 0x1) == 0), y, (z \| y) ) -> (-(and (x , 0x1)) & z ) \| y
	if (Cond.getOpcode() == X86ISD::SETCC &&
	Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(Cond.getOperand(1).getOperand(1))) {
	SDValue Cmp = Cond.getOperand(1);
	unsigned CondCode =
	cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

	if ((isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(CondCode == X86::COND_E \|\| CondCode == X86::COND_NE)) {
	// Y is the select operand that is not the all-ones constant (if both are
	// all-ones, Y is Op1).
	SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
	SDValue CmpOp0 = Cmp.getOperand(0);

	// Apply further optimizations for special cases
	// (select (x != 0), -1, 0) -> neg & sbb
	// (select (x == 0), 0, -1) -> neg & sbb
	if (isNullConstant(Y) &&
	(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
	SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
	SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	return Res;
	}

	// Compare x against 1; SETCC_CARRY with COND_B on that compare yields the
	// 0 / -1 value the comments above call sign_bit(x - 1).
	Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
	CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
	Cmp = ConvertCmpIfNecessary(Cmp, DAG);

	SDValue Res = // Res = 0 or -1.
	DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
	Res = DAG.getNOT(DL, Res, Res.getValueType());

	// NOTE(review): this tests Op2 rather than Y. When Op2 is all-ones and
	// Op1 (== Y) is zero, an OR with zero is still created here; that looks
	// harmless but redundant - confirm it is folded later.
	if (!isNullConstant(Op2))
	Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
	return Res;
	} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
	Cmp.getOperand(0).getOpcode() == ISD::AND &&
	isOneConstant(Cmp.getOperand(0).getOperand(1))) {
	SDValue CmpOp0 = Cmp.getOperand(0);
	SDValue Src1, Src2;
	// true if Op2 is XOR or OR operator and one of its operands
	// is equal to Op1
	// ( a , a op b) \|\| ( b , a op b)
	// On a match, Src1 receives the other operand of Op2 and Src2 receives Op1.
	auto isOrXorPattern = [&]() {
	if ((Op2.getOpcode() == ISD::XOR \|\| Op2.getOpcode() == ISD::OR) &&
	(Op2.getOperand(0) == Op1 \|\| Op2.getOperand(1) == Op1)) {
	Src1 =
	Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
	Src2 = Op1;
	return true;
	}
	return false;
	};

	if (isOrXorPattern()) {
	SDValue Neg;
	unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
	// we need mask of all zeros or ones with same size of the other
	// operands.
	if (CmpSz > VT.getSizeInBits())
	Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
	else if (CmpSz < VT.getSizeInBits())
	Neg = DAG.getNode(ISD::AND, DL, VT,
	DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
	DAG.getConstant(1, DL, VT));
	else
	Neg = CmpOp0;
	SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	Neg); // -(and (x, 0x1))
	SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
	return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
	}
	}
	}

	// Look past (and (setcc_carry (cmp ...)), 1).
	if (Cond.getOpcode() == ISD::AND &&
	Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	isOneConstant(Cond.getOperand(1)))
	Cond = Cond.getOperand(0);

	// If condition flag is set by a X86ISD::CMP, then use it as the condition
	// setting operand in place of the X86ISD::SETCC.
	unsigned CondOpcode = Cond.getOpcode();
	if (CondOpcode == X86ISD::SETCC \|\|
	CondOpcode == X86ISD::SETCC_CARRY) {
	CC = Cond.getOperand(0);

	SDValue Cmp = Cond.getOperand(1);
	unsigned Opc = Cmp.getOpcode();
	// NOTE: this VT shadows the function-level VT (which came from Op1); here
	// it is the select's own result type.
	MVT VT = Op.getSimpleValueType();

	// Scalar FP values that are not kept in SSE registers can only use the
	// condition directly if hasFPCMov accepts the condition code.
	bool IllegalFPCMov = false;
	if (VT.isFloatingPoint() && !VT.isVector() &&
	!isScalarFPTypeInSSEReg(VT)) // FPStack?
	IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

	if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) \|\|
	Opc == X86ISD::BT) { // FIXME
	Cond = Cmp;
	AddTest = false;
	}
	} else if (CondOpcode == ISD::USUBO \|\| CondOpcode == ISD::SSUBO \|\|
	CondOpcode == ISD::UADDO \|\| CondOpcode == ISD::SADDO \|\|
	((CondOpcode == ISD::UMULO \|\| CondOpcode == ISD::SMULO) &&
	Cond.getOperand(0).getValueType() != MVT::i8)) {
	// The condition is the overflow bit of an arithmetic-with-overflow node:
	// re-emit the arithmetic as the matching X86 node and use its flags result.
	SDValue LHS = Cond.getOperand(0);
	SDValue RHS = Cond.getOperand(1);
	unsigned X86Opcode;
	unsigned X86Cond;
	SDVTList VTs;
	switch (CondOpcode) {
	case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
	case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
	case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
	case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
	case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
	case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
	default: llvm_unreachable("unexpected overflowing operator");
	}
	// UMUL has two value results followed by the flags result; the other
	// nodes have one value result followed by the flags result.
	if (CondOpcode == ISD::UMULO)
	VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
	MVT::i32);
	else
	VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

	SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

	if (CondOpcode == ISD::UMULO)
	Cond = X86Op.getValue(2);
	else
	Cond = X86Op.getValue(1);

	CC = DAG.getConstant(X86Cond, DL, MVT::i8);
	AddTest = false;
	}

	if (AddTest) {
	// Look past the truncate if the high bits are known zero.
	if (isTruncWithZeroHighBitsInput(Cond, DAG))
	Cond = Cond.getOperand(0);

	// We know the result of AND is compared against zero. Try to match
	// it to BT.
	if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
	CC = NewSetCC.getOperand(0);
	Cond = NewSetCC.getOperand(1);
	AddTest = false;
	}
	}
	}

	// Nothing above produced flags: test the condition value against zero and
	// select on "not equal".
	if (AddTest) {
	CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
	Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
	}

	// a < b ? -1 : 0 -> RES = ~setcc_carry
	// a < b ? 0 : -1 -> RES = setcc_carry
	// a >= b ? -1 : 0 -> RES = setcc_carry
	// a >= b ? 0 : -1 -> RES = ~setcc_carry
	if (Cond.getOpcode() == X86ISD::SUB) {
	Cond = ConvertCmpIfNecessary(Cond, DAG);
	unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

	if ((CondCode == X86::COND_AE \|\| CondCode == X86::COND_B) &&
	(isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(isNullConstant(Op1) \|\| isNullConstant(Op2))) {
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Cond);
	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
	return DAG.getNOT(DL, Res, Res.getValueType());
	return Res;
	}
	}

	// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
	// widen the cmov and push the truncate through. This avoids introducing a new
	// branch during isel and doesn't add any extensions.
	if (Op.getValueType() == MVT::i8 &&
	Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
	SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
	if (T1.getValueType() == T2.getValueType() &&
	// Blacklist CopyFromReg to avoid partial register stalls.
	T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
	SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
	CC, Cond);
	return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
	}
	}

	// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
	// condition is true.
	// Hence the operands are passed in swapped order: false value, then true
	// value.
	SDValue Ops[] = { Op2, Op1, CC, Cond };
	return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
	}

	/// Lower an extension whose source is a vector of i1 (a mask) by producing
	/// all-ones / all-zeros elements of the destination type. Depending on the
	/// subtarget, the work may be done on i32 elements and/or on a 512-bit wide
	/// vector, with a truncate and/or subvector extract bringing the value back
	/// to the requested type.
	static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const MVT ResVT = Op->getSimpleValueType(0);
	  const MVT ResEltVT = ResVT.getVectorElementType();
	  SDValue Mask = Op->getOperand(0);
	  MVT MaskVT = Mask.getSimpleValueType();
	  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");

	  unsigned EltCount = ResVT.getVectorNumElements();

	  // Without BWI, i8/i16 element results are computed on i32 elements and
	  // truncated afterwards.
	  const bool NeedsI32Elts =
	      !Subtarget.hasBWI() && ResEltVT.getSizeInBits() <= 16;
	  const MVT ExtVT =
	      NeedsI32Elts ? MVT::getVectorVT(MVT::i32, EltCount) : ResVT;

	  // Without VLX, operate on a 512-bit vector: pad the mask with undef
	  // elements and scale the element count accordingly.
	  MVT WideVT = ExtVT;
	  if (!(ExtVT.is512BitVector() || Subtarget.hasVLX())) {
	    const unsigned Scale = 512 / ExtVT.getSizeInBits();
	    EltCount *= Scale;
	    MaskVT = MVT::getVectorVT(MVT::i1, EltCount);
	    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MaskVT,
	                       DAG.getUNDEF(MaskVT), Mask,
	                       DAG.getIntPtrConstant(0, dl));
	    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), EltCount);
	  }

	  // Use a direct VSEXT when the subtarget feature matching the element width
	  // is present; otherwise select between all-ones and zero.
	  const unsigned WideEltBits = WideVT.getVectorElementType().getSizeInBits();
	  const bool CanUseVSEXT = (Subtarget.hasDQI() && WideEltBits >= 32) ||
	                           (Subtarget.hasBWI() && WideEltBits <= 16);
	  SDValue Result;
	  if (!CanUseVSEXT) {
	    SDValue AllOnes = getOnesVector(WideVT, DAG, dl);
	    SDValue AllZeros = getZeroVector(WideVT, Subtarget, DAG, dl);
	    Result = DAG.getSelect(dl, WideVT, Mask, AllOnes, AllZeros);
	  } else {
	    Result = getExtendInVec(X86ISD::VSEXT, dl, WideVT, Mask, DAG);
	  }

	  // Undo the i32 detour: truncate back to the requested element type.
	  if (ResVT != ExtVT) {
	    WideVT = MVT::getVectorVT(ResEltVT, EltCount);
	    Result = DAG.getNode(ISD::TRUNCATE, dl, WideVT, Result);
	  }

	  // Undo the widening: keep only the low subvector of the requested type.
	  if (WideVT != ResVT)
	    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Result,
	                         DAG.getIntPtrConstant(0, dl));

	  return Result;
	}

	/// Lower an any-extend. Sources with i1 elements go through the mask
	/// extension path; otherwise LowerAVXExtend is tried when the subtarget
	/// reports hasFp256(). Returns an empty SDValue if neither path applies.
	static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  const MVT SrcVT = Op->getOperand(0).getSimpleValueType();
	  if (SrcVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

	  if (Subtarget.hasFp256()) {
	    SDValue Extended = LowerAVXExtend(Op, DAG, Subtarget);
	    if (Extended)
	      return Extended;
	  }

	  return SDValue();
	}

	// Lowers SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
	// The sign-extend form has to cope with every vector size, on both SSE4.1
	// and pre-SSE4.1 targets. The zero-extend form is only expected for
	// MVT::v64i8 inputs on AVX512 targets without BWI.
	// Returns an empty SDValue for type/subtarget combinations not handled here.
	static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  SDValue Src = Op->getOperand(0);
	  const MVT DstVT = Op->getSimpleValueType(0);
	  const MVT SrcVT = Src.getSimpleValueType();
	  assert(DstVT.getSizeInBits() == SrcVT.getSizeInBits());

	  const MVT DstEltVT = DstVT.getVectorElementType();
	  const MVT SrcEltVT = SrcVT.getVectorElementType();
	  assert(DstEltVT.getSizeInBits() > SrcEltVT.getSizeInBits());

	  // Only i8/i16/i32 -> i16/i32/i64 element extensions are handled.
	  const bool DstEltOK = DstEltVT == MVT::i64 || DstEltVT == MVT::i32 ||
	                        DstEltVT == MVT::i16;
	  if (!DstEltOK)
	    return SDValue();
	  const bool SrcEltOK = SrcEltVT == MVT::i32 || SrcEltVT == MVT::i16 ||
	                        SrcEltVT == MVT::i8;
	  if (!SrcEltOK)
	    return SDValue();

	  // The vector width must be matched by the corresponding subtarget feature.
	  const bool WidthOK = (DstVT.is128BitVector() && Subtarget.hasSSE2()) ||
	                       (DstVT.is256BitVector() && Subtarget.hasInt256()) ||
	                       (DstVT.is512BitVector() && Subtarget.hasAVX512());
	  if (!WidthOK)
	    return SDValue();

	  SDLoc dl(Op);

	  // Results wider than 128 bits only consume the low part of the input: at
	  // least as many elements as the result has, and never less than 128 bits.
	  if (DstVT.getSizeInBits() > 128) {
	    int NeededBits = SrcEltVT.getSizeInBits() * DstVT.getVectorNumElements();
	    Src = extractSubVector(Src, 0, DAG, dl, std::max(NeededBits, 128));
	  }

	  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
	          SrcVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

	  // With SSE4.1 the 128-bit pmovsx* forms are legal and never reach this
	  // point; the 256/512-bit AVX2/AVX512 forms are emitted here directly.
	  if (Subtarget.hasInt256()) {
	    assert(DstVT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
	    const bool IsSigned = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
	    unsigned ExtOpc = IsSigned ? X86ISD::VSEXT : X86ISD::VZEXT;
	    return DAG.getNode(ExtOpc, dl, DstVT, Src);
	  }

	  // From here on only sign extension is expected.
	  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
	         "Unexpected opcode!");

	  // Pre-SSE4.1: repeatedly unpack the low lanes into the high half of a
	  // twice-as-wide element, then arithmetic-shift right to sign extend. SRAI
	  // exists only for i16/i32 elements, so widening stops at i32 and the i64
	  // case is finished separately below.
	  SDValue Widened = Src;
	  MVT WidenedVT = SrcVT;
	  while (WidenedVT != DstVT && WidenedVT.getVectorElementType() != MVT::i32) {
	    Widened = DAG.getNode(X86ISD::UNPCKL, dl, WidenedVT,
	                          DAG.getUNDEF(WidenedVT), Widened);
	    MVT DoubledEltVT =
	        MVT::getIntegerVT(WidenedVT.getScalarSizeInBits() * 2);
	    WidenedVT =
	        MVT::getVectorVT(DoubledEltVT, WidenedVT.getVectorNumElements() / 2);
	    Widened = DAG.getBitcast(WidenedVT, Widened);
	  }

	  // If any unpacking happened, shift the original bits down with sign fill.
	  SDValue SignExtended = Widened;
	  if (WidenedVT != SrcVT) {
	    unsigned ShiftAmt =
	        WidenedVT.getScalarSizeInBits() - SrcEltVT.getSizeInBits();
	    SignExtended = DAG.getNode(X86ISD::VSRAI, dl, WidenedVT, Widened,
	                               DAG.getConstant(ShiftAmt, dl, MVT::i8));
	  }

	  if (WidenedVT == DstVT)
	    return SignExtended;

	  // v4i32 -> v2i64: interleave each sign-extended 32-bit value with a word
	  // holding its replicated sign bit.
	  if (DstVT == MVT::v2i64 && WidenedVT == MVT::v4i32) {
	    SDValue SignBits = DAG.getNode(X86ISD::VSRAI, dl, WidenedVT, Widened,
	                                   DAG.getConstant(31, dl, MVT::i8));
	    const int InterleaveMask[] = {0, 4, 1, 5};
	    SDValue Interleaved = DAG.getVectorShuffle(WidenedVT, dl, SignExtended,
	                                               SignBits, InterleaveMask);
	    return DAG.getBitcast(DstVT, Interleaved);
	  }

	  return SDValue();
	}

	/// Lower a vector sign-extend. i1-element sources use the mask path. For the
	/// listed source/destination type pairs, a single VSEXT is emitted when the
	/// subtarget has Int256; otherwise the input is split into two halves, each
	/// half is sign-extended in-register, and the halves are concatenated.
	/// Returns an empty SDValue for any other type pair.
	static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const MVT DstVT = Op->getSimpleValueType(0);
	  SDValue Src = Op->getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();

	  if (SrcVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

	  // Type pairs (destination, source) this routine knows how to lower.
	  auto IsPair = [&](MVT To, MVT From) {
	    return DstVT == To && SrcVT == From;
	  };
	  const bool Supported = IsPair(MVT::v4i64, MVT::v4i32) ||
	                         IsPair(MVT::v8i32, MVT::v8i16) ||
	                         IsPair(MVT::v16i16, MVT::v16i8) ||
	                         IsPair(MVT::v8i64, MVT::v8i32) ||
	                         IsPair(MVT::v8i64, MVT::v8i16) ||
	                         IsPair(MVT::v16i32, MVT::v16i16) ||
	                         IsPair(MVT::v16i32, MVT::v16i8) ||
	                         IsPair(MVT::v32i16, MVT::v32i8);
	  if (!Supported)
	    return SDValue();

	  if (Subtarget.hasInt256())
	    return DAG.getNode(X86ISD::VSEXT, dl, DstVT, Src);

	  // AVX without Int256: split the input in two with shuffles, e.g. for
	  // v4i32 the masks are {0, 1, -1, -1} and {2, 3, -1, -1}; sign-extend each
	  // half in-register (v4i32 -> v2i64, v8i16 -> v4i32) and concatenate the
	  // two results into the destination type.
	  const unsigned NumElems = SrcVT.getVectorNumElements();
	  const unsigned HalfElems = NumElems / 2;
	  SDValue Undef = DAG.getUNDEF(SrcVT);

	  SmallVector<int,8> LoMask(NumElems, -1);
	  SmallVector<int,8> HiMask(NumElems, -1);
	  for (unsigned Idx = 0; Idx != HalfElems; ++Idx) {
	    LoMask[Idx] = Idx;
	    HiMask[Idx] = Idx + HalfElems;
	  }

	  SDValue LoHalf = DAG.getVectorShuffle(SrcVT, dl, Src, Undef, LoMask);
	  SDValue HiHalf = DAG.getVectorShuffle(SrcVT, dl, Src, Undef, HiMask);

	  const MVT HalfVT = MVT::getVectorVT(DstVT.getVectorElementType(),
	                                      DstVT.getVectorNumElements() / 2);

	  LoHalf = DAG.getSignExtendVectorInReg(LoHalf, dl, HalfVT);
	  HiHalf = DAG.getSignExtendVectorInReg(HiHalf, dl, HalfVT);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, LoHalf, HiHalf);
	}

	// Custom lowering for a truncating store whose memory type is a vector of
	// i1. Three strategies, tried in order:
	//  1. VLX+BWI+DQI all present, or a 16-element value: truncate to the memory
	//     type (padded to v8i1 if narrower than 8 bits) and store.
	//  2. At most 8 elements: pad to 8 elements, truncate to v8i1, bitcast to i8
	//     and store that byte.
	//  3. Otherwise the value must be v32i8: store two v16i1 halves, the second
	//     one 2 bytes after the first, and join the chains with a TokenFactor.
	static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  StoreSDNode *Store = cast<StoreSDNode>(StOp.getNode());
	  SDLoc dl(Store);
	  EVT MemVT = Store->getMemoryVT();
	  assert(Store->isTruncatingStore() && "We only custom truncating store.");
	  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
	         "Expected truncstore of i1 vector");

	  SDValue Val = Store->getValue();
	  const MVT ValVT = Val.getValueType().getSimpleVT();
	  const unsigned NumElts = ValVT.getVectorNumElements();

	  const bool HasAllMaskFeatures =
	      Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI();
	  if (HasAllMaskFeatures || NumElts == 16) {
	    // Truncate and store - everything is legal.
	    SDValue Mask = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Val);
	    if (MemVT.getSizeInBits() < 8)
	      Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
	                         DAG.getUNDEF(MVT::v8i1), Mask,
	                         DAG.getIntPtrConstant(0, dl));
	    return DAG.getStore(Store->getChain(), dl, Mask, Store->getBasePtr(),
	                        Store->getMemOperand());
	  }

	  // A subset of the features; assume only AVX-512F is available.
	  if (NumElts <= 8) {
	    SDValue Padded = Val;
	    if (NumElts < 8) {
	      // Widen to an 8-element vector with undef upper elements.
	      MVT EightEltVT = MVT::getVectorVT(ValVT.getScalarType(), 8);
	      Padded = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, EightEltVT,
	                           DAG.getUNDEF(EightEltVT), Padded,
	                           DAG.getIntPtrConstant(0, dl));
	    }
	    SDValue Mask = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Padded);
	    SDValue Byte = DAG.getBitcast(MVT::i8, Mask);
	    return DAG.getStore(Store->getChain(), dl, Byte, Store->getBasePtr(),
	                        Store->getMemOperand());
	  }

	  // Remaining case: v32i8, stored as two separate v16i1 halves.
	  assert(ValVT == MVT::v32i8 && "Unexpected operand type");

	  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Val,
	                               DAG.getIntPtrConstant(0, dl));
	  LoHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, LoHalf);
	  SDValue LoPtr = Store->getBasePtr();
	  SDValue LoStore = DAG.getStore(Store->getChain(), dl, LoHalf, LoPtr,
	                                 Store->getMemOperand());

	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Val,
	                               DAG.getIntPtrConstant(16, dl));
	  HiHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, HiHalf);

	  // The upper 16 mask bits live 2 bytes past the lower 16.
	  SDValue HiPtr = DAG.getMemBasePlusOffset(LoPtr, 2, dl);
	  SDValue HiStore = DAG.getStore(Store->getChain(), dl, HiHalf, HiPtr,
	                                 Store->getPointerInfo().getWithOffset(2),
	                                 MinAlign(Store->getAlignment(), 2U),
	                                 Store->getMemOperand()->getFlags());

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoStore, HiStore);
	}

	/// Lower an extending load whose memory type is a vector of i1. The mask is
	/// loaded (as a mask vector, as an i8, or as two v16i1 halves, depending on
	/// the element count and subtarget features) and then extended to the
	/// register type with ZERO_EXTEND for ZEXTLOAD and SIGN_EXTEND for every
	/// other extension type. Users of the original load's chain result are
	/// redirected to the chain of the replacement load(s).
	static SDValue LowerExtended1BitVectorLoad(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {

	LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	SDLoc dl(Ld);
	EVT MemVT = Ld->getMemoryVT();
	assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
	"Expected i1 vector load");
	// Anything that is not a zero-extending load is lowered with a sign extend.
	unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
	ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	MVT VT = Op.getValueType().getSimpleVT();
	unsigned NumElts = VT.getVectorNumElements();

	if ((Subtarget.hasBWI() && NumElts >= 32) \|\|
	(Subtarget.hasDQI() && NumElts < 16) \|\|
	NumElts == 16) {
	// Load and extend - everything is legal
	if (NumElts < 8) {
	// Fewer than 8 elements: load a full v8i1 and narrow afterwards.
	SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
	if (Subtarget.hasVLX()) {
	// Extract to v4i1/v2i1.
	SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load,
	DAG.getIntPtrConstant(0, dl));
	// Finally, do a normal extend (sign or zero, per ExtOpcode) to the
	// desired register.
	return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
	}

	// No VLX: extend all 8 elements, then keep the low subvector.
	MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
	SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);

	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}
	SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	// Finally, do a normal extend (sign or zero, per ExtOpcode) to the desired
	// register.
	return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
	}

	if (NumElts <= 8) {
	// A subset, assume that we have only AVX-512F
	// Load the mask as a plain i8 and reinterpret it as v8i1.
	SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load);

	if (NumElts == 8)
	return DAG.getNode(ExtOpcode, dl, VT, BitVec);

	if (Subtarget.hasVLX()) {
	// Extract to v4i1/v2i1.
	SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec,
	DAG.getIntPtrConstant(0, dl));
	// Finally, do a normal extend (sign or zero, per ExtOpcode) to the
	// desired register.
	return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
	}

	// No VLX: extend all 8 elements, then keep the low subvector.
	MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
	SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}

	// Remaining case: a v32i8 result, built from two v16i1 loads.
	assert(VT == MVT::v32i8 && "Unexpected extload type");

	SDValue BasePtr = Ld->getBasePtr();
	SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());

	// The second v16i1 half is loaded from 2 bytes past the base pointer.
	SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);

	SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi,
	Ld->getPointerInfo().getWithOffset(2),
	MinAlign(Ld->getAlignment(), 2U),
	Ld->getMemOperand()->getFlags());

	// Merge both load chains and hand the merged chain to the original load's
	// chain users.
	SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	LoadLo.getValue(1), LoadHi.getValue(1));
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);

	// Extend each half to v16i8 and concatenate into the v32i8 result.
	SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
	SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
	}

	// Lower vector extended loads using a shuffle. If SSSE3 is not available we
	// may emit an illegal shuffle but the expansion is still better than scalar
	// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
	// we'll emit a shuffle and an arithmetic shift.
	// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
	// TODO: It is possible to support ZExt by zeroing the undef values during
	// the shuffle phase or after the shuffle.
	static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT RegVT = Op.getSimpleValueType();
	assert(RegVT.isVector() && "We only custom lower vector sext loads.");
	assert(RegVT.isInteger() &&
	"We only custom lower integer vector sext loads.");

	// Nothing useful we can do without SSE2 shuffles.
	assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");

	LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	SDLoc dl(Ld);
	EVT MemVT = Ld->getMemoryVT();
	if (MemVT.getScalarType() == MVT::i1)
	return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	unsigned RegSz = RegVT.getSizeInBits();

	ISD::LoadExtType Ext = Ld->getExtensionType();

	assert((Ext == ISD::EXTLOAD \|\| Ext == ISD::SEXTLOAD)
	&& "Only anyext and sext are currently implemented.");
	assert(MemVT != RegVT && "Cannot extend to the same type");
	assert(MemVT.isVector() && "Must load a vector from memory");

	unsigned NumElems = RegVT.getVectorNumElements();
	unsigned MemSz = MemVT.getSizeInBits();
	assert(RegSz > MemSz && "Register size must be greater than the mem size");

	if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
	// The only way in which we have a legal 256-bit vector result but not the
	// integer 256-bit operations needed to directly lower a sextload is if we
	// have AVX1 but not AVX2. In that case, we can always emit a sextload to
	// a 128-bit vector and a normal sign_extend to 256-bits that should get
	// correctly legalized. We do this late to allow the canonical form of
	// sextload to persist throughout the rest of the DAG combiner -- it wants
	// to fold together any extensions it can, and so will fuse a sign_extend
	// of an sextload into a sextload targeting a wider value.
	SDValue Load;
	if (MemSz == 128) {
	// Just switch this to a normal load.
	assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
	"it must be a legal 128-bit vector "
	"type!");
	Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
	Ld->getPointerInfo(), Ld->getAlignment(),
	Ld->getMemOperand()->getFlags());
	} else {
	assert(MemSz < 128 &&
	"Can't extend a type wider than 128 bits to a 256 bit vector!");
	// Do an sext load to a 128-bit vector type. We want to use the same
	// number of elements, but elements half as wide. This will end up being
	// recursively lowered by this routine, but will succeed as we definitely
	// have all the necessary features if we're using AVX1.
	EVT HalfEltVT =
	EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
	EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
	Load =
	DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
	Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
	Ld->getMemOperand()->getFlags());
	}

	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	// Finally, do a normal sign-extend to the desired register.
	return DAG.getSExtOrTrunc(Load, dl, RegVT);
	}

	// All sizes must be a power of two.
	assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
	"Non-power-of-two elements are not custom lowered!");

	// Attempt to load the original value using scalar loads.
	// Find the largest scalar type that divides the total loaded size.
	MVT SclrLoadTy = MVT::i8;
	for (MVT Tp : MVT::integer_valuetypes()) {
	if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
	SclrLoadTy = Tp;
	}
	}

	// On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
	if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
	(64 <= MemSz))
	SclrLoadTy = MVT::f64;

	// Calculate the number of scalar loads that we need to perform
	// in order to load our vector from memory.
	unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

	assert((Ext != ISD::SEXTLOAD \|\| NumLoads == 1) &&
	"Can only lower sext loads with a single scalar load!");

	unsigned loadRegZize = RegSz;
	if (Ext == ISD::SEXTLOAD && RegSz >= 256)
	loadRegZize = 128;

	// If we don't have BWI we won't be able to create the shuffle needed for
	// v8i8->v8i64.
	if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
	MemVT == MVT::v8i8)
	loadRegZize = 128;

	// Represent our vector as a sequence of elements which are the
	// largest scalar that we can load.
	EVT LoadUnitVecVT = EVT::getVectorVT(
	*DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());

	// Represent the data using the same element type that is stored in
	// memory. In practice, we ''widen'' MemVT.
	EVT WideVecVT =
	EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	loadRegZize / MemVT.getScalarSizeInBits());

	assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
	"Invalid vector type");

	// We can't shuffle using an illegal type.
	assert(TLI.isTypeLegal(WideVecVT) &&
	"We only lower types that form legal widened vector types");

	SmallVector<SDValue, 8> Chains;
	SDValue Ptr = Ld->getBasePtr();
	SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
	TLI.getPointerTy(DAG.getDataLayout()));
	SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

	for (unsigned i = 0; i < NumLoads; ++i) {
	// Perform a single load.
	SDValue ScalarLoad =
	DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
	Ld->getAlignment(), Ld->getMemOperand()->getFlags());
	Chains.push_back(ScalarLoad.getValue(1));
	// Create the first element type using SCALAR_TO_VECTOR in order to avoid
	// another round of DAGCombining.
	if (i == 0)
	Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
	else
	Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
	ScalarLoad, DAG.getIntPtrConstant(i, dl));

	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
	}

	SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

	// Bitcast the loaded value to a vector of the original element type, in
	// the size of the target vector type.
	SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
	unsigned SizeRatio = RegSz / MemSz;

	if (Ext == ISD::SEXTLOAD) {
	// If we have SSE4.1, we can directly emit a VSEXT node.
	if (Subtarget.hasSSE41()) {
	SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	return Sext;
	}

	// Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
	// lanes.
	assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
	"We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");

	SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	return Shuff;
	}

	if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
	MemVT == MVT::v8i8) {
	SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	return Sext;
	}

	// Redistribute the loaded elements into the different locations.
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i * SizeRatio] = i;

	SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
	DAG.getUNDEF(WideVecVT), ShuffleVec);

	// Bitcast to the requested type.
	Shuff = DAG.getBitcast(RegVT, Shuff);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	return Shuff;
	}

	/// Check whether \p Op is an ISD::AND or ISD::OR whose two operands are both
	/// X86ISD::SETCC nodes that have no use other than this AND / OR.
	/// \p Opc is always set to the opcode of \p Op, even when false is returned.
	static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
	  Opc = Op.getOpcode();
	  const bool IsLogicOp = Opc == ISD::AND || Opc == ISD::OR;
	  if (!IsLogicOp)
	    return false;

	  // Both inputs must be single-use X86ISD::SETCC nodes.
	  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
	    SDValue Operand = Op.getOperand(OpIdx);
	    if (Operand.getOpcode() != X86ISD::SETCC || !Operand.hasOneUse())
	      return false;
	  }
	  return true;
	}

	/// Check whether \p Op has the form (xor (X86ISD::SETCC ...), 1) where the
	/// SETCC node is used only by this XOR.
	static bool isXor1OfSetCC(SDValue Op) {
	  // Must be an XOR whose second operand is the constant one.
	  if (Op.getOpcode() != ISD::XOR || !isOneConstant(Op.getOperand(1)))
	    return false;

	  SDValue SetCC = Op.getOperand(0);
	  return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
	}

	/// Lower a BRCOND node (operands: chain, condition, destination) to an
	/// X86ISD::BRCOND that branches on a condition code and a flag-producing
	/// node.
	///
	/// The lowering tries, in order, to:
	///  - reuse the flags of an existing X86ISD::SETCC / SETCC_CARRY,
	///  - fold an overflow-checking arithmetic node into its flag-producing X86
	///    counterpart,
	///  - split AND/OR of two SETCCs on the same compare (and FCMP_OEQ/UNE style
	///    SETCCs) into two conditional branches,
	///  - match an AND against zero to a bit test,
	/// and otherwise emits an explicit test of the condition against zero.
	SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
	  bool addTest = true;
	  SDValue Chain = Op.getOperand(0);
	  SDValue Cond = Op.getOperand(1);
	  SDValue Dest = Op.getOperand(2);
	  SDLoc dl(Op);
	  SDValue CC;
	  bool Inverted = false;

	  // Redirect the unconditional branch \p BR to the current Dest and return
	  // the block it used to target. Used when the successors have to be swapped
	  // to implement a condition with two conditional branches.
	  auto RetargetFollowingBR = [&](SDNode *BR) -> SDValue {
	    SDValue FalseBB = BR->getOperand(1);
	    SDNode *NewBR = DAG.UpdateNodeOperands(BR, BR->getOperand(0), Dest);
	    assert(NewBR == BR);
	    (void)NewBR;
	    return FalseBB;
	  };

	  if (Cond.getOpcode() == ISD::SETCC) {
	    // Check for setcc([su]{add,sub,mul}o == 0).
	    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
	        isNullConstant(Cond.getOperand(1)) &&
	        Cond.getOperand(0).getResNo() == 1 &&
	        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
	         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
	      // Branch on "no overflow": remember to invert the condition code.
	      Inverted = true;
	      Cond = Cond.getOperand(0);
	    } else {
	      if (SDValue NewCond = LowerSETCC(Cond, DAG))
	        Cond = NewCond;
	    }
	  }
	#if 0
	  // FIXME: LowerXALUO doesn't handle these!!
	  else if (Cond.getOpcode() == X86ISD::ADD  ||
	           Cond.getOpcode() == X86ISD::SUB  ||
	           Cond.getOpcode() == X86ISD::SMUL ||
	           Cond.getOpcode() == X86ISD::UMUL)
	    Cond = LowerXALUO(Cond, DAG);
	#endif

	  // Look past (and (setcc_carry (cmp ...)), 1).
	  if (Cond.getOpcode() == ISD::AND &&
	      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	      isOneConstant(Cond.getOperand(1)))
	    Cond = Cond.getOperand(0);

	  // If condition flag is set by a X86ISD::CMP, then use it as the condition
	  // setting operand in place of the X86ISD::SETCC.
	  unsigned CondOpcode = Cond.getOpcode();
	  if (CondOpcode == X86ISD::SETCC ||
	      CondOpcode == X86ISD::SETCC_CARRY) {
	    CC = Cond.getOperand(0);

	    SDValue Cmp = Cond.getOperand(1);
	    unsigned Opc = Cmp.getOpcode();
	    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
	    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
	      Cond = Cmp;
	      addTest = false;
	    } else {
	      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
	      default: break;
	      case X86::COND_O:
	      case X86::COND_B:
	        // These can only come from an arithmetic instruction with overflow,
	        // e.g. SADDO, UADDO.
	        Cond = Cond.getOperand(1);
	        addTest = false;
	        break;
	      }
	    }
	  }
	  CondOpcode = Cond.getOpcode();
	  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
	      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
	      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
	       Cond.getOperand(0).getValueType() != MVT::i8)) {
	    SDValue LHS = Cond.getOperand(0);
	    SDValue RHS = Cond.getOperand(1);
	    unsigned X86Opcode;
	    unsigned X86Cond;
	    SDVTList VTs;
	    // Keep this in sync with LowerXALUO, otherwise we might create redundant
	    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
	    // X86ISD::INC).
	    switch (CondOpcode) {
	    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
	    case ISD::SADDO:
	      if (isOneConstant(RHS)) {
	        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
	        break;
	      }
	      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
	    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
	    case ISD::SSUBO:
	      if (isOneConstant(RHS)) {
	        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
	        break;
	      }
	      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
	    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
	    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
	    default: llvm_unreachable("unexpected overflowing operator");
	    }
	    if (Inverted)
	      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
	    // X86ISD::UMUL is built with two value results before the flags; every
	    // other node here has a single value result followed by the flags.
	    if (CondOpcode == ISD::UMULO)
	      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
	                          MVT::i32);
	    else
	      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

	    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

	    if (CondOpcode == ISD::UMULO)
	      Cond = X86Op.getValue(2);
	    else
	      Cond = X86Op.getValue(1);

	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    addTest = false;
	  } else {
	    unsigned CondOpc;
	    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
	      SDValue Cmp = Cond.getOperand(0).getOperand(1);
	      if (CondOpc == ISD::OR) {
	        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
	        // two branches instead of an explicit OR instruction with a
	        // separate test.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp)) {
	          CC = Cond.getOperand(0).getOperand(0);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = Cond.getOperand(1).getOperand(0);
	          Cond = Cmp;
	          addTest = false;
	        }
	      } else { // ISD::AND
	        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
	        // two branches instead of an explicit AND instruction with a
	        // separate test. However, we only do this if this block doesn't
	        // have a fall-through edge, because this requires an explicit
	        // jmp when the condition is false.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp) &&
	            Op.getNode()->hasOneUse()) {
	          X86::CondCode CCode =
	            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	          CCode = X86::GetOppositeBranchCondition(CCode);
	          CC = DAG.getConstant(CCode, dl, MVT::i8);
	          SDNode *User = *Op.getNode()->use_begin();
	          // Look for an unconditional branch following this conditional branch.
	          // We need this because we need to reverse the successors in order
	          // to implement FCMP_OEQ.
	          if (User->getOpcode() == ISD::BR) {
	            Dest = RetargetFollowingBR(User);

	            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                                Chain, Dest, CC, Cmp);
	            CCode =
	              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
	            CCode = X86::GetOppositeBranchCondition(CCode);
	            CC = DAG.getConstant(CCode, dl, MVT::i8);
	            Cond = Cmp;
	            addTest = false;
	          }
	        }
	      }
	    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
	      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
	      // It should be transformed during dag combiner except when the condition
	      // is set by an arithmetic-with-overflow node.
	      X86::CondCode CCode =
	        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	      CCode = X86::GetOppositeBranchCondition(CCode);
	      CC = DAG.getConstant(CCode, dl, MVT::i8);
	      Cond = Cond.getOperand(0).getOperand(1);
	      addTest = false;
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
	      // For FCMP_OEQ, we can emit
	      // two branches instead of an explicit AND instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_OEQ.
	        if (User->getOpcode() == ISD::BR) {
	          Dest = RetargetFollowingBR(User);

	          // Branch to the false block on "not equal" and again on "parity"
	          // (unordered); fall through to the retargeted BR otherwise.
	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	        }
	      }
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
	      // For FCMP_UNE, we can emit
	      // two branches instead of an explicit OR instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_UNE.
	        if (User->getOpcode() == ISD::BR) {
	          SDValue FalseBB = RetargetFollowingBR(User);

	          // The first branch (on "not equal") still targets the original
	          // destination; only the second branch (on "no parity") goes to the
	          // former target of the unconditional branch.
	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	          Dest = FalseBB;
	        }
	      }
	    }
	  }

	  if (addTest) {
	    // Look past the truncate if the high bits are known zero.
	    if (isTruncWithZeroHighBitsInput(Cond, DAG))
	      Cond = Cond.getOperand(0);

	    // We know the result of AND is compared against zero. Try to match
	    // it to BT.
	    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	      if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
	        CC = NewSetCC.getOperand(0);
	        Cond = NewSetCC.getOperand(1);
	        addTest = false;
	      }
	    }
	  }

	  if (addTest) {
	    // Nothing matched: explicitly test the condition value against zero.
	    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    Cond = EmitTest(Cond, X86Cond, dl, DAG);
	  }
	  Cond = ConvertCmpIfNecessary(Cond, DAG);
	  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                     Chain, Dest, CC, Cond);
	}

	// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
	// Calls to _alloca are needed to probe the stack when allocating more than 4k
	// bytes in one go. Touching the stack at 4K increments is necessary to ensure
	// that the guard pages used by the OS virtual memory manager are allocated in
	// correct sequence.
	//
	// Three strategies are used, selected below:
	//  - no special lowering required: subtract the size from the stack pointer
	//    register directly and realign the result if needed;
	//  - split (segmented) stacks: emit X86ISD::SEG_ALLOCA;
	//  - otherwise (Windows non-MachO targets, or a stack probe symbol is set):
	//    emit X86ISD::WIN_ALLOCA and read the new stack pointer back.
	// Operands of \p Op are (chain, size, alignment); the result is the merged
	// pair (allocated address, output chain).
	SDValue
	X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  bool SplitStack = MF.shouldSplitStack();
	  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
	  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
	               SplitStack || EmitStackProbe;
	  SDLoc dl(Op);

	  // Get the inputs.
	  SDNode *Node = Op.getNode();
	  SDValue Chain = Op.getOperand(0);
	  SDValue Size  = Op.getOperand(1);
	  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  EVT VT = Node->getValueType(0);

	  // Chain the dynamic stack allocation so that it doesn't modify the stack
	  // pointer when other instructions are using the stack.
	  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

	  bool Is64Bit = Subtarget.is64Bit();
	  MVT SPTy = getPointerTy(DAG.getDataLayout());

	  SDValue Result;
	  if (!Lower) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
	    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
	                    " not tell us which reg is the stack pointer!");

	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
	    Chain = SP.getValue(1);
	    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	    unsigned StackAlign = TFI.getStackAlignment();
	    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
	    // Only realign when the request exceeds the stack alignment.
	    if (Align > StackAlign)
	      Result = DAG.getNode(ISD::AND, dl, VT, Result,
	                           DAG.getConstant(-(uint64_t)Align, dl, VT));
	    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
	  } else if (SplitStack) {
	    MachineRegisterInfo &MRI = MF.getRegInfo();

	    if (Is64Bit) {
	      // The 64 bit implementation of segmented stacks needs to clobber both r10
	      // r11. This makes it impossible to use it along with nested parameters.
	      const Function &F = MF.getFunction();
	      for (const auto &A : F.args()) {
	        if (A.hasNestAttr())
	          report_fatal_error("Cannot use segmented stacks with functions that "
	                             "have nested arguments.");
	      }
	    }

	    // Pass the requested size to SEG_ALLOCA through a fresh virtual register.
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
	    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
	    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
	                         DAG.getRegister(Vreg, SPTy));
	  } else {
	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
	    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

	    // The allocation result is the stack pointer after WIN_ALLOCA.
	    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	    unsigned SPReg = RegInfo->getStackRegister();
	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
	    Chain = SP.getValue(1);

	    if (Align) {
	      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
	                       DAG.getConstant(-(uint64_t)Align, dl, VT));
	      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
	    }

	    Result = SP;
	  }

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	  SDValue Ops[2] = {Result, Chain};
	  return DAG.getMergeValues(Ops, dl);
	}

	/// Lower VASTART (operands: chain, va_list address, source value).
	///
	/// On 32-bit targets and for the Win64 calling convention this stores the
	/// address of the VarArgsFrameIndex slot into the va_list. Otherwise the four
	/// fields of the __va_list_tag structure are initialized with independent
	/// stores whose chains are joined by a TokenFactor.
	SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDLoc DL(Op);

	  if (!Subtarget.is64Bit() ||
	      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
	    // vastart just stores the address of the VarArgsFrameIndex slot into the
	    // memory location argument.
	    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
	                        MachinePointerInfo(SV));
	  }

	  // __va_list_tag:
	  //   gp_offset         (0 - 6 * 8)
	  //   fp_offset         (48 - 48 + 8 * 16)
	  //   overflow_arg_area (point to parameters coming in memory).
	  //   reg_save_area
	  //
	  // The two pointer fields are 8 bytes wide under LP64 and 4 bytes wide
	  // otherwise, which moves reg_save_area from offset 16 to offset 12.
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();
	  const unsigned PtrFieldSize = IsLP64 ? 8 : 4;
	  const unsigned RegSaveAreaOffset = IsLP64 ? 16 : 12;

	  SmallVector<SDValue, 8> MemOps;
	  SDValue FIN = Op.getOperand(1);
	  // Store gp_offset
	  SDValue Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV));
	  MemOps.push_back(Store);

	  // Store fp_offset
	  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV, 4));
	  MemOps.push_back(Store);

	  // Store ptr to overflow_arg_area
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
	  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	  Store =
	      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
	  MemOps.push_back(Store);

	  // Store ptr to reg_save_area.
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
	                    DAG.getIntPtrConstant(PtrFieldSize, DL));
	  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
	  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
	                       MachinePointerInfo(SV, RegSaveAreaOffset));
	  MemOps.push_back(Store);
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}

	/// Lower VAARG (operands: chain, va_list pointer, source value, alignment)
	/// for 64-bit targets.
	///
	/// For the Win64 calling convention the generic expansion is used. Otherwise
	/// an X86ISD::VAARG_64 memory intrinsic node computes the address of the next
	/// argument, and the argument itself is then loaded from that address.
	/// Only floating point arguments of at most 16 bytes (except f80) and integer
	/// arguments of at most 32 bytes are supported.
	SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.is64Bit() &&
	         "LowerVAARG only handles 64-bit va_arg!");
	  assert(Op.getNumOperands() == 4);

	  MachineFunction &MF = DAG.getMachineFunction();
	  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
	    // The Win64 ABI uses char* instead of a structure.
	    return DAG.expandVAArg(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue SrcPtr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  unsigned Align = Op.getConstantOperandVal(3);
	  SDLoc dl(Op);

	  EVT ArgVT = Op.getNode()->getValueType(0);
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
	  uint8_t ArgMode;

	  // Decide which area this value should be read from.
	  // TODO: Implement the AMD64 ABI in its entirety. This simple
	  // selection mechanism works only for the basic types.
	  if (ArgVT == MVT::f80) {
	    llvm_unreachable("va_arg for f80 not yet implemented");
	  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
	    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
	  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
	    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
	  } else {
	    llvm_unreachable("Unhandled argument type in LowerVAARG");
	  }

	  if (ArgMode == 2) {
	    // Sanity Check: Make sure using fp_offset makes sense.
	    assert(!Subtarget.useSoftFloat() &&
	           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
	           Subtarget.hasSSE1());
	  }

	  // Insert VAARG_64 node into the DAG
	  // VAARG_64 returns two values: Variable Argument Address, Chain
	  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
	                       DAG.getConstant(ArgMode, dl, MVT::i8),
	                       DAG.getConstant(Align, dl, MVT::i32)};
	  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
	  // The node both reads and updates the va_list, hence load | store.
	  SDValue VAARG = DAG.getMemIntrinsicNode(
	    X86ISD::VAARG_64, dl,
	    VTs, InstOps, MVT::i64,
	    MachinePointerInfo(SV),
	    /*Align=*/0,
	    MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
	  Chain = VAARG.getValue(1);

	  // Load the next argument and return it
	  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
	}

	/// Lower VACOPY (operands: chain, destination va_list, source va_list,
	/// destination source value, source source value) for 64-bit targets.
	///
	/// For the Win64 calling convention the generic expansion is used; otherwise
	/// the whole va_list structure is copied with a memcpy.
	static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
	  // where a va_list is still an i8*.
	  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
	  if (Subtarget.isCallingConvWin64(
	        DAG.getMachineFunction().getFunction().getCallingConv()))
	    // Probably a Win64 va_copy.
	    return DAG.expandVACopy(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue DstPtr = Op.getOperand(1);
	  SDValue SrcPtr = Op.getOperand(2);
	  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  SDLoc DL(Op);

	  // The two pointer members are 8 bytes each under LP64 (24 bytes total,
	  // 8-byte aligned) but only 4 bytes each otherwise (16 bytes total, 4-byte
	  // aligned). This matches the field offsets LowerVASTART writes; copying
	  // 24 bytes for the smaller layout would run past the end of the va_list.
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();
	  const unsigned VAListSize = IsLP64 ? 24 : 16;
	  const unsigned VAListAlign = IsLP64 ? 8 : 4;

	  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
	                       DAG.getIntPtrConstant(VAListSize, DL), VAListAlign,
	                       /*isVolatile*/false,
	                       false, false,
	                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
	}

	/// Handle vector element shifts where the shift amount is a constant.
	/// Takes immediate version of shift as input.
	///
	/// \p Opc is one of X86ISD::VSHLI, X86ISD::VSRLI or X86ISD::VSRAI. \p SrcOp
	/// is bitcast to \p VT if its type differs. A shift by zero returns the
	/// (bitcast) source, an out-of-range arithmetic right shift is clamped to
	/// "element width - 1", any other out-of-range shift yields the zero vector,
	/// and a build vector of constants/undefs is folded element by element.
	static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                          SDValue SrcOp, uint64_t ShiftAmt,
	                                          SelectionDAG &DAG) {
	  MVT ElementType = VT.getVectorElementType();

	  // Bitcast the source vector to the output type, this is mainly necessary for
	  // vXi8/vXi64 shifts.
	  if (VT != SrcOp.getSimpleValueType())
	    SrcOp = DAG.getBitcast(VT, SrcOp);

	  // Fold this packed shift into its first operand if ShiftAmt is 0.
	  if (ShiftAmt == 0)
	    return SrcOp;

	  // Check for ShiftAmt >= element width
	  if (ShiftAmt >= ElementType.getSizeInBits()) {
	    if (Opc == X86ISD::VSRAI)
	      ShiftAmt = ElementType.getSizeInBits() - 1;
	    else
	      return DAG.getConstant(0, dl, VT);
	  }

	  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
	         && "Unknown target vector shift-by-constant node");

	  // Fold this packed vector shift into a build vector if SrcOp is a
	  // vector of Constants or UNDEFs.
	  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
	    // Apply the shift selected by Opc to a single constant element.
	    auto ShiftConstant = [&](const APInt &C) -> APInt {
	      switch (Opc) {
	      default: llvm_unreachable("Unknown opcode!");
	      case X86ISD::VSHLI: return C.shl(ShiftAmt);
	      case X86ISD::VSRLI: return C.lshr(ShiftAmt);
	      case X86ISD::VSRAI: return C.ashr(ShiftAmt);
	      }
	    };

	    SmallVector<SDValue, 8> Elts;
	    unsigned NumElts = SrcOp->getNumOperands();
	    for (unsigned i = 0; i != NumElts; ++i) {
	      SDValue CurrentOp = SrcOp->getOperand(i);
	      // Undef elements stay undef.
	      if (CurrentOp->isUndef()) {
	        Elts.push_back(CurrentOp);
	        continue;
	      }
	      const APInt &C = cast<ConstantSDNode>(CurrentOp)->getAPIntValue();
	      Elts.push_back(DAG.getConstant(ShiftConstant(C), dl, ElementType));
	    }

	    return DAG.getBuildVector(VT, dl, Elts);
	  }

	  return DAG.getNode(Opc, dl, VT, SrcOp,
	                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
	}

	/// Handle vector element shifts where the shift amount may or may not be a
	/// constant. Takes immediate version of shift as input.
	///
	/// A constant \p ShAmt is forwarded to getTargetVShiftByConstNode. Otherwise
	/// \p Opc is switched to its non-immediate counterpart and \p ShAmt (i32 or
	/// i64) is placed in the low element of a 128-bit vector that becomes the
	/// shift-count operand.
	static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                   SDValue SrcOp, SDValue ShAmt,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT SVT = ShAmt.getSimpleValueType();
	  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

	  // Catch shift-by-constant.
	  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
	    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
	                                      CShAmt->getZExtValue(), DAG);

	  // Change opcode to non-immediate version
	  switch (Opc) {
	    default: llvm_unreachable("Unknown target vector shift node");
	    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
	    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
	    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
	  }

	  // Need to build a vector containing shift amount.
	  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
	  // +======================+============+=======================================+
	  // | ShAmt is             | HasSSE4.1? | Construct ShAmt vector as             |
	  // +======================+============+=======================================+
	  // | i64                  | Yes, No    | Use ShAmt as lowest elt               |
	  // | (i32 zext(i16))      | Yes        | zero-extend in-reg                    |
	  // | i32 extract_vec_elt  | Yes        | zero-extend in-reg                    |
	  // | any other i32        | Yes, No    | v4i32 build_vector(ShAmt, 0, ud, ud)) |
	  // +======================+============+=======================================+

	  if (SVT == MVT::i64)
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
	  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
	           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
	    // Strip the scalar zero-extension and redo it inside the vector.
	    ShAmt = ShAmt.getOperand(0);
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
	    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
	  } else if (Subtarget.hasSSE41() &&
	             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
	    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
	  } else {
	    // Explicitly zero the second 32-bit lane so that the low 64 bits hold the
	    // zero-extended shift amount.
	    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
	                        DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
	    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
	  }

	  // The return type has to be a 128-bit type with the same element
	  // type as the input type.
	  MVT EltVT = VT.getVectorElementType();
	  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());

	  ShAmt = DAG.getBitcast(ShVT, ShAmt);
	  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
	}

	/// \brief Convert the scalar integer \p Mask into a vector of i1 of type
	/// \p MaskVT, applying the casting or extending needed when lowering masking
	/// intrinsics.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                           const SDLoc &dl) {
	  // Constant all-ones / all-zeros masks become constants of the mask type.
	  if (isAllOnesConstant(Mask))
	    return DAG.getConstant(1, dl, MaskVT);
	  if (X86::isZeroNode(Mask))
	    return DAG.getConstant(0, dl, MaskVT);

	  // A mask narrower than the requested type is widened first.
	  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
	    MVT WideIntVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
	    Mask = DAG.getNode(ISD::ANY_EXTEND, dl, WideIntVT, Mask);
	  }

	  const bool IsI64MaskOn32Bit =
	      Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit();

	  if (!IsI64MaskOn32Bit) {
	    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	                                     Mask.getSimpleValueType().getSizeInBits());
	    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
	    // are extracted by EXTRACT_SUBVECTOR.
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
	                       DAG.getBitcast(BitcastVT, Mask),
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (MaskVT != MVT::v64i1) {
	    // MaskVT require < 64bit. Truncate mask (should succeed in any case),
	    // and bitcast.
	    MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
	    return DAG.getBitcast(MaskVT,
	                          DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
	  }

	  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	  // In case 32bit mode, bitcast i64 is illegal, extend/split it.
	  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(0, dl, MVT::i32));
	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(1, dl, MVT::i32));

	  LoHalf = DAG.getBitcast(MVT::v32i1, LoHalf);
	  HiHalf = DAG.getBitcast(MVT::v32i1, HiHalf);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
	}

	/// \brief Apply \p Mask to the vector result \p Op when lowering masking
	/// intrinsics.
	///
	/// Compare-style opcodes are combined with the mask directly (and / or);
	/// every other opcode yields (select \p Mask, \p Op, \p PreservedSrc). The
	/// mask is first converted to the matching vXi1 type through getMaskNode. An
	/// undef \p PreservedSrc is replaced by a zero vector.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  MVT OpVT = Op.getSimpleValueType();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, OpVT.getVectorNumElements());
	  SDLoc dl(Op);

	  // A constant all-ones mask selects every element: Op is returned as is.
	  if (isAllOnesConstant(Mask))
	    return Op;

	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  // Select opcode used unless the switch below picks another one or returns.
	  unsigned SelectOpc = ISD::VSELECT;
	  switch (Op.getOpcode()) {
	  case X86ISD::CMPM:
	  case X86ISD::CMPM_RND:
	  case X86ISD::CMPMU:
	  case X86ISD::VPSHUFBITQMB:
	    return DAG.getNode(ISD::AND, dl, OpVT, Op, VMask);
	  case X86ISD::VFPCLASS:
	    // NOTE(review): merged with OR while the opcodes above use AND; confirm
	    // this matches the instruction selection patterns.
	    return DAG.getNode(ISD::OR, dl, OpVT, Op, VMask);
	  case X86ISD::VTRUNC:
	  case X86ISD::VTRUNCS:
	  case X86ISD::VTRUNCUS:
	  case X86ISD::CVTPS2PH:
	    // ISD::VSELECT can't be used here because it is not always "Legal" for
	    // the destination type. For example vpmovqb requires only AVX512, while
	    // a vselect operating on byte elements requires BWI.
	    SelectOpc = X86ISD::SELECT;
	    break;
	  default:
	    break;
	  }

	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(OpVT, Subtarget, DAG, dl);
	  return DAG.getNode(SelectOpc, dl, OpVT, VMask, Op, PreservedSrc);
	}

	/// \brief Build the node for a predicated scalar operation.
	/// \returns (X86selects \p Mask, \p Op, \p PreservedSrc), or \p Op combined
	/// with the mask directly for the scalar compare / fpclass opcodes.
	/// The mask is wrapped into an MVT::v1i1 value with SCALAR_TO_VECTOR while
	/// lowering masking intrinsics.
	/// Unlike getVectorMaskingNode this uses "X86selects" instead of "vselect",
	/// since a "vselect" node can't be created for a scalar instruction.
	static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  // A constant mask with bit 0 set needs no masking: return Op unchanged.
	  auto *ConstMask = dyn_cast<ConstantSDNode>(Mask);
	  if (ConstMask && (ConstMask->getZExtValue() & 0x1))
	    return Op;

	  MVT OpVT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);

	  switch (Op.getOpcode()) {
	  case X86ISD::FSETCCM:
	  case X86ISD::FSETCCM_RND:
	    return DAG.getNode(ISD::AND, dl, OpVT, Op, IMask);
	  case X86ISD::VFPCLASSS:
	    // NOTE(review): merged with OR while the compares above use AND; confirm
	    // this matches the instruction selection patterns.
	    return DAG.getNode(ISD::OR, dl, OpVT, Op, IMask);
	  default:
	    break;
	  }

	  // An undef pass-through value is replaced by a zero vector.
	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(OpVT, Subtarget, DAG, dl);
	  return DAG.getNode(X86ISD::SELECTS, dl, OpVT, IMask, Op, PreservedSrc);
	}

	static int getSEHRegistrationNodeSize(const Function *Fn) {
	if (!Fn->hasPersonalityFn())
	report_fatal_error(
	"querying registration node size for function without personality");
	// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
	// WinEHStatePass for the full struct definition.
	switch (classifyEHPersonality(Fn->getPersonalityFn())) {
	case EHPersonality::MSVC_X86SEH: return 24;
	case EHPersonality::MSVC_CXX: return 16;
	default: break;
	}
	report_fatal_error(
	"can only recover FP for 32-bit MSVC EH personality functions");
	}

	/// Recover the parent function's frame pointer from \p EntryEBP.
	///
	/// When the MSVC runtime transfers control to us, either to an outlined
	/// function or when returning to a parent frame after catching an exception,
	/// the parent frame pointer is obtained by arithmetic on the incoming EBP:
	///   RegNodeBase = EntryEBP - RegNodeSize
	///   ParentFP = RegNodeBase - ParentFrameOffset
	/// Subtracting RegNodeSize gives the offset of the registration node, and
	/// subtracting the offset (negative on x86) leads back to the parent FP.
	/// On 64-bit subtargets the result is EntryEBP + ParentFrameOffset instead.
	static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
	                                   SDValue EntryEBP) {
	  // The parent function may no longer have a personality function if the
	  // exceptional code was optimized away; in that case the incoming EBP is
	  // returned unchanged.
	  if (!Fn->hasPersonalityFn())
	    return EntryEBP;

	  MachineFunction &MF = DAG.getMachineFunction();
	  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDLoc dl;

	  // Get an MCSymbol that will ultimately resolve to the frame offset of the
	  // EH registration, or the .set_setframe offset.
	  MCContext &Ctx = MF.getMMI().getContext();
	  MCSymbol *OffsetSym = Ctx.getOrCreateParentFrameOffsetSymbol(
	      GlobalValue::dropLLVMManglingEscape(Fn->getName()));
	  SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT,
	                                          DAG.getMCSymbol(OffsetSym, PtrVT));

	  // For x64, return EntryEBP + ParentFrameOffset. This adjusts from RSP after
	  // the prologue to RBP in the parent function.
	  const X86Subtarget &Subtarget =
	      static_cast<const X86Subtarget &>(DAG.getSubtarget());
	  if (Subtarget.is64Bit())
	    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

	  // 32-bit: step back over the registration node, then over the frame offset.
	  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
	  SDValue RegNodeSizeVal = DAG.getConstant(RegNodeSize, dl, PtrVT);
	  SDValue RegNodeBase =
	      DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, RegNodeSizeVal);
	  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
	}

	SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	SelectionDAG &DAG) const {
	// Helper to detect if the operand is CUR_DIRECTION rounding mode.
	auto isRoundModeCurDirection = [](SDValue Rnd) {
	if (!isa<ConstantSDNode>(Rnd))
	return false;

	unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
	return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
	};

	SDLoc dl(Op);
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	MVT VT = Op.getSimpleValueType();
	const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
	if (IntrData) {
	switch(IntrData->Type) {
	case INTR_TYPE_1OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
	case INTR_TYPE_2OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2));
	case INTR_TYPE_3OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3));
	case INTR_TYPE_4OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
	case INTR_TYPE_1OP_MASK_RM: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue RoundingMode;
	// We always add rounding mode to the Node.
	// If the rounding mode is not specified, we add the
	// "current direction" mode.
	if (Op.getNumOperands() == 4)
	RoundingMode =
	DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	else
	RoundingMode = Op.getOperand(4);
	assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
	RoundingMode),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_1OP_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RM Opcode is specified and
	// - RM is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	bool HasRounding = IntrWithRoundingModeOpcode != 0;
	if (Op.getNumOperands() == (5U + HasRounding)) {
	if (HasRounding) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2, Rnd),
	Mask, passThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2),
	Mask, passThru, Subtarget, DAG);
	}

	assert(Op.getNumOperands() == (6U + HasRounding) &&
	"Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	if (HasRounding) {
	SDValue Sae = Op.getOperand(6);
	if (!isRoundModeCurDirection(Sae))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2,
	RoundingMode, Sae),
	Mask, passThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2, RoundingMode),
	Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src0 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	if (Op.getNumOperands() == 6) {
	SDValue Sae = Op.getOperand(5);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
	Sae),
	Mask, Src0, Subtarget, DAG);
	}
	assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	SDValue Sae = Op.getOperand(6);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
	RoundingMode, Sae),
	Mask, Src0, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK:
	case INTR_TYPE_2OP_IMM8_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
	Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	// TODO: Intrinsics should have fast-math-flags to propagate.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (6 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 6)
	Rnd = Op.getOperand(5);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(6);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Imm = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (7 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 7)
	Rnd = Op.getOperand(6);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Imm, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_IMM8_MASK:
	case INTR_TYPE_3OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
	Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(6);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case VPERM_2OP_MASK : {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	// Swap Src1 and Src2 in the node creation
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
	Mask, PassThru, Subtarget, DAG);
	}
	case VPERM_3OP_MASKZ:
	case VPERM_3OP_MASK:{
	MVT VT = Op.getSimpleValueType();
	// Src2 is the PassThru
	SDValue Src1 = Op.getOperand(1);
	// PassThru needs to be the same type as the destination in order
	// to pattern match correctly.
	SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == VPERM_3OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else
	PassThru = Src2;

	// Swap Src1 and Src2 in the node creation
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src2, Src1, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case FMA_OP_MASK3:
	case FMA_OP_MASKZ:
	case FMA_OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == FMA_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else if (IntrData->Type == FMA_OP_MASK3)
	PassThru = Src3;
	else
	PassThru = Src1;

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case FMA_OP_SCALAR_MASK:
	case FMA_OP_SCALAR_MASK3:
	case FMA_OP_SCALAR_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
	PassThru = Src3;
	else
	PassThru = Src1;

	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
	Op.getValueType(), Src1, Src2,
	Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}

	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
	Op.getValueType(), Src1, Src2,
	Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case IFMA_OP_MASKZ:
	case IFMA_OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = Src1;

	// set PassThru element
	if (IntrData->Type == IFMA_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);

	// Node we need to swizzle the operands to pass the multiply operands
	// first.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src2, Src3, Src1),
	Mask, PassThru, Subtarget, DAG);
	}
	case TERLOG_OP_MASK:
	case TERLOG_OP_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
	SDValue Mask = Op.getOperand(5);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = Src1;
	// Set PassThru element.
	if (IntrData->Type == TERLOG_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Src4),
	Mask, PassThru, Subtarget, DAG);
	}
	case CVTPD2PS:
	// ISD::FP_ROUND has a second argument that indicates if the truncation
	// does not change the value. Set it to 0 since it can change.
	return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
	DAG.getIntPtrConstant(0, dl));
	case CVTPD2PS_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RM Opcode is specified and
	// - RM is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
	// ISD::FP_ROUND has a second argument that indicates if the truncation
	// does not change the value. Set it to 0 since it can change.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
	DAG.getIntPtrConstant(0, dl)),
	Mask, PassThru, Subtarget, DAG);
	}
	case FPCLASS: {
	// FPclass intrinsics with mask
	SDValue Src1 = Op.getOperand(1);
	MVT VT = Src1.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
	SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
	Subtarget, DAG);
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), FPclassMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case FPCLASSS: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
	SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
	Subtarget, DAG);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
	DAG.getIntPtrConstant(0, dl));
	}
	case CMP_MASK:
	case CMP_MASK_CC: {
	// Comparison intrinsics with masks.
	// Example of transformation:
	// (i8 (int_x86_avx512_mask_pcmpeq_q_128
	// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
	// (i8 (bitcast
	// (v8i1 (insert_subvector undef,
	// (v2i1 (and (PCMPEQM %a, %b),
	// (extract_subvector
	// (v8i1 (bitcast %mask)), 0))), 0))))
	MVT VT = Op.getOperand(1).getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
	MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	SDValue Cmp;
	if (IntrData->Type == CMP_MASK_CC) {
	SDValue CC = Op.getOperand(3);
	CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC, Rnd);
	}
	//default rounding mode
	if(!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC);

	} else {
	assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
	Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2));
	}
	SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
	Subtarget, DAG);
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), CmpMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case CMP_MASK_SCALAR_CC: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
	SDValue Mask = Op.getOperand(4);

	SDValue Cmp;
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
	}
	//default rounding mode
	if(!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

	SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
	Subtarget, DAG);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
	DAG.getIntPtrConstant(0, dl));
	}
	case COMI: { // Comparison intrinsics
	ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
	SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
	SDValue SetCC;
	switch (CC) {
	case ISD::SETEQ: { // (ZF = 0 and PF = 0)
	SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
	SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
	break;
	}
	case ISD::SETNE: { // (ZF = 1 or PF = 1)
	SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
	SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
	break;
	}
	case ISD::SETGT: // (CF = 0 and ZF = 0)
	SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
	break;
	case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
	SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
	break;
	}
	case ISD::SETGE: // CF = 0
	SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
	break;
	case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
	SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
	break;
	default:
	llvm_unreachable("Unexpected illegal condition!");
	}
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case COMI_RM: { // Comparison intrinsics with Sae
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	SDValue Sae = Op.getOperand(4);

	SDValue FCmp;
	if (isRoundModeCurDirection(Sae))
	FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8));
	else
	FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8), Sae);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
	DAG.getIntPtrConstant(0, dl));
	}
	case VSHIFT:
	return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
	Op.getOperand(1), Op.getOperand(2), Subtarget,
	DAG);
	case COMPRESS_EXPAND_IN_REG: {
	SDValue Mask = Op.getOperand(3);
	SDValue DataToCompress = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	if (isAllOnesConstant(Mask)) // return data as is
	return Op.getOperand(1);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	DataToCompress),
	Mask, PassThru, Subtarget, DAG);
	}
	case BROADCASTM: {
	SDValue Mask = Op.getOperand(1);
	MVT MaskVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	Mask = DAG.getBitcast(MaskVT, Mask);
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
	}
	case MASK_BINOP: {
	MVT VT = Op.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

	SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
	SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
	SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
	return DAG.getBitcast(VT, Res);
	}
	case FIXUPIMMS:
	case FIXUPIMMS_MASKZ:
	case FIXUPIMM:
	case FIXUPIMM_MASKZ:{
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Imm = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Passthru = (IntrData->Type == FIXUPIMM \|\| IntrData->Type == FIXUPIMMS ) ?
	Src1 : getZeroVector(VT, Subtarget, DAG, dl);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (7 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 7)
	Rnd = Op.getOperand(6);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	if (IntrData->Type == FIXUPIMM \|\| IntrData->Type == FIXUPIMM_MASKZ)
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Imm, Rnd),
	Mask, Passthru, Subtarget, DAG);
	else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Imm, Rnd),
	Mask, Passthru, Subtarget, DAG);
	}
	case CONVERT_TO_MASK: {
	MVT SrcVT = Op.getOperand(1).getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
	MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

	SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
	Op.getOperand(1));
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), CvtMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case ROUNDP: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
	Op.getOperand(2),
	DAG.getConstant(0xf, dl, MVT::i32));
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), RoundingMode);
	}
	case ROUNDS: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
	Op.getOperand(3),
	DAG.getConstant(0xf, dl, MVT::i32));
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), Op.getOperand(2), RoundingMode);
	}
	default:
	break;
	}
	}

	switch (IntNo) {
	default: return SDValue(); // Don't custom lower most intrinsics.

	case Intrinsic::x86_avx2_permd:
	case Intrinsic::x86_avx2_permps:
	// Operands intentionally swapped. Mask is last operand to intrinsic,
	// but second operand for node/instruction.
	return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
	Op.getOperand(2), Op.getOperand(1));

	// ptest and testp intrinsics. The intrinsic these come from are designed to
	// return an integer value, not just an instruction so lower it to the ptest
	// or testp pattern and a setcc for the result.
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestz_256:
	case Intrinsic::x86_avx_ptestc_256:
	case Intrinsic::x86_avx_ptestnzc_256:
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256: {
	bool IsTestPacked = false;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_avx_ptestz_256:
	// ZF = 1
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_avx_ptestc_256:
	// CF = 1
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestnzc_256:
	// ZF and CF = 0
	X86CC = X86::COND_A;
	break;
	}

	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
	SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case Intrinsic::x86_avx512_kortestz_w:
	case Intrinsic::x86_avx512_kortestc_w: {
	X86::CondCode X86CC =
	(IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_avx512_knot_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
	SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_avx512_kandn_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	// Invert LHS for the not.
	LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
	DAG.getConstant(1, dl, MVT::v16i1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_avx512_kxnor_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
	// Invert result for the not.
	Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
	DAG.getConstant(1, dl, MVT::v16i1));
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_sse42_pcmpistria128:
	case Intrinsic::x86_sse42_pcmpestria128:
	case Intrinsic::x86_sse42_pcmpistric128:
	case Intrinsic::x86_sse42_pcmpestric128:
	case Intrinsic::x86_sse42_pcmpistrio128:
	case Intrinsic::x86_sse42_pcmpestrio128:
	case Intrinsic::x86_sse42_pcmpistris128:
	case Intrinsic::x86_sse42_pcmpestris128:
	case Intrinsic::x86_sse42_pcmpistriz128:
	case Intrinsic::x86_sse42_pcmpestriz128: {
	unsigned Opcode;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::x86_sse42_pcmpistria128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpestria128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpistric128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpestric128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpistrio128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpestrio128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpistris128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpestris128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpistriz128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_sse42_pcmpestriz128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_E;
	break;
	}
	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
	SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_sse42_pcmpistri128:
	case Intrinsic::x86_sse42_pcmpestri128: {
	unsigned Opcode;
	if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
	Opcode = X86ISD::PCMPISTRI;
	else
	Opcode = X86ISD::PCMPESTRI;

	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	return DAG.getNode(Opcode, dl, VTs, NewOps);
	}

	case Intrinsic::eh_sjlj_lsda: {
	MachineFunction &MF = DAG.getMachineFunction();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	auto &Context = MF.getMMI().getContext();
	MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
	Twine(MF.getFunctionNumber()));
	return DAG.getNode(getGlobalWrapperKind(), dl, VT,
	DAG.getMCSymbol(S, PtrVT));
	}

	case Intrinsic::x86_seh_lsda: {
	// Compute the symbol for the LSDA. We know it'll get emitted later.
	MachineFunction &MF = DAG.getMachineFunction();
	SDValue Op1 = Op.getOperand(1);
	auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
	MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
	GlobalValue::dropLLVMManglingEscape(Fn->getName()));

	// Generate a simple absolute symbol reference. This intrinsic is only
	// supported on 32-bit Windows, which isn't PIC.
	SDValue Result = DAG.getMCSymbol(LSDASym, VT);
	return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
	}

	case Intrinsic::x86_seh_recoverfp: {
	SDValue FnOp = Op.getOperand(1);
	SDValue IncomingFPOp = Op.getOperand(2);
	GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
	auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
	if (!Fn)
	report_fatal_error(
	"llvm.x86.seh.recoverfp must take a function as the first argument");
	return recoverFramePointer(DAG, Fn, IncomingFPOp);
	}

	case Intrinsic::localaddress: {
	// Returns one of the stack, base, or frame pointer registers, depending on
	// which is used to reference local variables.
	MachineFunction &MF = DAG.getMachineFunction();
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	unsigned Reg;
	if (RegInfo->hasBasePointer(MF))
	Reg = RegInfo->getBaseRegister();
	else // This function handles the SP or FP case.
	Reg = RegInfo->getPtrSizedFrameRegister(MF);
	return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
	}
	}
	}

	/// Build the machine node for an AVX2 gather intrinsic.
	///
	/// \p Opc is the machine opcode to emit and \p Op the intrinsic node being
	/// lowered; \p Op supplies the debug location and the result vector type.
	/// The machine node receives the operands {Src, Base, Scale, Index, Disp,
	/// Segment, Mask, Chain}, with a zero displacement and register 0 as segment.
	///
	/// \returns an empty SDValue when \p ScaleOp is not a constant; otherwise the
	/// merge of result 0 (typed as Op's value type) and result 2 (the MVT::Other
	/// result) of the machine node.
	static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                                 SDValue Src, SDValue Mask, SDValue Base,
	                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
	                                 const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
	  // Scale must be constant.
	  if (!C)
	    return SDValue();
	  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
	  // The mask keeps its incoming vector type; it is also the type of the
	  // second result of the node.
	  EVT MaskVT = Mask.getValueType();
	  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);
	  // If source is undef or we know it won't be used, use a zero vector
	  // to break register dependency.
	  // TODO: use undef instead and let ExecutionDepsFix deal with it?
	  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
	  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
	  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
	  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Build the machine node for a masked gather intrinsic whose mask is
	/// converted to a vXi1 mask vector.
	///
	/// \p Opc is the machine opcode to emit and \p Op the intrinsic node being
	/// lowered; \p Op supplies the debug location and the result vector type.
	/// The mask type has one i1 element per element of \p Index. The machine
	/// node receives the operands {Src, VMask, Base, Scale, Index, Disp, Segment,
	/// Chain}, with a zero displacement and register 0 as segment.
	///
	/// \returns an empty SDValue when \p ScaleOp is not a constant; otherwise the
	/// merge of result 0 (typed as Op's value type) and result 2 (the MVT::Other
	/// result) of the machine node.
	static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                             SDValue Src, SDValue Mask, SDValue Base,
	                             SDValue Index, SDValue ScaleOp, SDValue Chain,
	                             const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
	  // Scale must be constant.
	  if (!C)
	    return SDValue();
	  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
	  MVT MaskVT = MVT::getVectorVT(MVT::i1,
	                                Index.getSimpleValueType().getVectorNumElements());

	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);
	  // If source is undef or we know it won't be used, use a zero vector
	  // to break register dependency.
	  // TODO: use undef instead and let ExecutionDepsFix deal with it?
	  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
	  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
	  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
	  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Build the machine node for a masked scatter intrinsic.
	///
	/// The mask is converted to a vXi1 vector with one element per element of
	/// \p Index. The node's operands are {Base, Scale, Index, Disp, Segment,
	/// VMask, Src, Chain}; its results are the mask type and MVT::Other.
	///
	/// \returns an empty SDValue when \p ScaleOp is not a constant, otherwise
	/// result 1 of the new machine node.
	static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                              SDValue Src, SDValue Mask, SDValue Base,
	                              SDValue Index, SDValue ScaleOp, SDValue Chain,
	                              const X86Subtarget &Subtarget) {
	  SDLoc Loc(Op);

	  // Bail out unless the scale is a constant node.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  // Address pieces: constant scale, zero displacement, register 0 as segment.
	  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), Loc, MVT::i8);
	  SDValue ZeroDisp = DAG.getTargetConstant(0, Loc, MVT::i32);
	  SDValue SegReg = DAG.getRegister(0, MVT::i32);

	  // One i1 mask bit per index element.
	  unsigned NumIdxElts = Index.getSimpleValueType().getVectorNumElements();
	  MVT KMaskVT = MVT::getVectorVT(MVT::i1, NumIdxElts);
	  SDValue KMask = getMaskNode(Mask, KMaskVT, Subtarget, DAG, Loc);

	  SDVTList ResultVTs = DAG.getVTList(KMaskVT, MVT::Other);
	  SDValue NodeOps[] = {Base, ScaleImm, Index, ZeroDisp,
	                       SegReg, KMask, Src, Chain};
	  SDNode *Scatter = DAG.getMachineNode(Opc, Loc, ResultVTs, NodeOps);
	  return SDValue(Scatter, 1);
	}

	/// Build the machine node for a masked gather/scatter prefetch intrinsic.
	///
	/// The mask is converted to a vXi1 vector with one element per element of
	/// \p Index. The node's operands are {VMask, Base, Scale, Index, Disp,
	/// Segment, Chain} and its only result type is MVT::Other.
	///
	/// \returns an empty SDValue when \p ScaleOp is not a constant, otherwise
	/// result 0 of the new machine node.
	static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                               SDValue Mask, SDValue Base, SDValue Index,
	                               SDValue ScaleOp, SDValue Chain,
	                               const X86Subtarget &Subtarget) {
	  SDLoc Loc(Op);

	  // Bail out unless the scale is a constant node.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  // Address pieces: constant scale, zero displacement, register 0 as segment.
	  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), Loc, MVT::i8);
	  SDValue ZeroDisp = DAG.getTargetConstant(0, Loc, MVT::i32);
	  SDValue SegReg = DAG.getRegister(0, MVT::i32);

	  // One i1 mask bit per index element.
	  unsigned NumIdxElts = Index.getSimpleValueType().getVectorNumElements();
	  MVT KMaskVT = MVT::getVectorVT(MVT::i1, NumIdxElts);
	  SDValue KMask = getMaskNode(Mask, KMaskVT, Subtarget, DAG, Loc);

	  SDValue NodeOps[] = {KMask, Base, ScaleImm, Index, ZeroDisp, SegReg, Chain};
	  SDNode *Prefetch = DAG.getMachineNode(Opc, Loc, MVT::Other, NodeOps);
	  return SDValue(Prefetch, 0);
	}

	/// Lower a builtin intrinsic that returns the value of an extended control
	/// register (XGETBV).
	///
	/// Operand 0 of \p N is the incoming chain and operand 2 the register index.
	/// Two values are appended to \p Results: the 64-bit register value followed
	/// by the output chain.
	static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget,
	                                       SmallVectorImpl<SDValue> &Results) {
	  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
	  SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
	  const bool Is64Bit = Subtarget.is64Bit();

	  // ECX selects which XCR register is read, so copy the index there first.
	  SDValue OutChain =
	      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
	  SDNode *XGetBV = DAG.getMachineNode(X86::XGETBV, DL, ChainAndGlue, OutChain);
	  OutChain = SDValue(XGetBV, 0);

	  // The XCR content comes back in EDX:EAX; in 64-bit mode the copies are
	  // made from the full-width RDX/RAX registers.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT HalfVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo = DAG.getCopyFromReg(OutChain, DL, LoReg, HalfVT,
	                                  SDValue(XGetBV, 1));
	  SDValue Hi = DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, HalfVT,
	                                  Lo.getValue(2));
	  OutChain = Hi.getValue(1);

	  SDValue Merged;
	  if (Is64Bit) {
	    // Combine the halves as Lo | (Hi << 32).
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
	                                    DAG.getConstant(32, DL, MVT::i8));
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Glue the two 32-bit halves together with a BUILD_PAIR.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Lower the builtin intrinsic that reads a performance monitor counter
	/// (x86_rdpmc).
	///
	/// Operand 0 of \p N is the incoming chain and operand 2 the counter index.
	/// Two values are appended to \p Results: the 64-bit counter value followed
	/// by the output chain.
	static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
	                                      SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget,
	                                      SmallVectorImpl<SDValue> &Results) {
	  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
	  SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
	  const bool Is64Bit = Subtarget.is64Bit();

	  // ECX selects which performance counter is read.
	  SDValue OutChain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
	                                      N->getOperand(2));
	  SDValue Read = DAG.getNode(X86ISD::RDPMC_DAG, DL, ChainAndGlue, OutChain);

	  // The counter comes back in EDX:EAX; in 64-bit mode the copies are made
	  // from the full-width RDX/RAX registers.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT HalfVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo = DAG.getCopyFromReg(Read, DL, LoReg, HalfVT, Read.getValue(1));
	  SDValue Hi = DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, HalfVT,
	                                  Lo.getValue(2));
	  OutChain = Hi.getValue(1);

	  SDValue Merged;
	  if (Is64Bit) {
	    // EAX holds the low-order 32 bits and EDX the supported high-order bits
	    // of the counter; combine them as Lo | (Hi << 32).
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
	                                    DAG.getConstant(32, DL, MVT::i8));
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Glue the two 32-bit halves together with a BUILD_PAIR.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Lower the builtin intrinsics that read the time stamp counter (x86_rdtsc
	/// and x86_rdtscp). Also used to custom lower READCYCLECOUNTER nodes.
	///
	/// \p Opcode is the DAG opcode of the read node to create; operand 0 of \p N
	/// is the incoming chain. For X86ISD::RDTSCP_DAG, operand 2 of \p N is the
	/// address at which the ECX result is stored. Two values are appended to
	/// \p Results: the 64-bit counter value followed by the output chain.
	static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
	                                    SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    SmallVectorImpl<SDValue> &Results) {
	  SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue Read = DAG.getNode(Opcode, DL, ChainAndGlue, N->getOperand(0));
	  const bool Is64Bit = Subtarget.is64Bit();

	  // The time-stamp counter (a 64-bit MSR) is returned in EDX:EAX, high-order
	  // 32 bits in EDX and low-order 32 bits in EAX. In 64-bit mode the copies
	  // are made from the full-width RDX/RAX registers.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT HalfVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo = DAG.getCopyFromReg(Read, DL, LoReg, HalfVT, Read.getValue(1));
	  SDValue Hi = DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, HalfVT,
	                                  Lo.getValue(2));
	  SDValue OutChain = Hi.getValue(1);

	  if (Opcode == X86ISD::RDTSCP_DAG) {
	    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

	    // RDTSCP additionally loads IA32_TSC_AUX (address C000_0103H) into ECX;
	    // read it out explicitly on the chain ...
	    SDValue Aux = DAG.getCopyFromReg(OutChain, DL, X86::ECX, MVT::i32,
	                                     Hi.getValue(2));
	    // ... and store it at the location passed to the 'rdtscp' intrinsic.
	    OutChain = DAG.getStore(Aux.getValue(1), DL, Aux, N->getOperand(2),
	                            MachinePointerInfo());
	  }

	  SDValue Merged;
	  if (Is64Bit) {
	    // Combine the halves as Lo | (Hi << 32).
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
	                                    DAG.getConstant(32, DL, MVT::i8));
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Glue the two 32-bit halves together with a BUILD_PAIR.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Custom-lower a READCYCLECOUNTER node by emitting an X86ISD::RDTSC_DAG read
	/// and merging the values it produces.
	static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  SmallVector<SDValue, 2> ValueAndChain;
	  getReadTimeStampCounter(Op.getNode(), Loc, X86ISD::RDTSC_DAG, DAG, Subtarget,
	                          ValueAndChain);
	  return DAG.getMergeValues(ValueAndChain, Loc);
	}

	/// Record the frame index of the EH registration node (operand 2 of \p Op) in
	/// the function's WinEHFuncInfo.
	///
	/// Aborts with a fatal error if the function has no WinEH info or if the
	/// operand is not a frame index. \returns the chain operand unchanged.
	static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
	  SDValue InChain = Op.getOperand(0);
	  SDValue RegNodeOp = Op.getOperand(2);

	  WinEHFuncInfo *FuncEHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (FuncEHInfo == nullptr)
	    report_fatal_error("EH registrations only live in functions using WinEH");

	  // The operand must be a frame index node; remember which slot it names.
	  if (auto *FrameIdx = dyn_cast<FrameIndexSDNode>(RegNodeOp)) {
	    FuncEHInfo->EHRegNodeFrameIndex = FrameIdx->getIndex();
	    // No DAG nodes are created; hand the incoming chain straight back.
	    return InChain;
	  }

	  report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
	}

	/// Record the frame index of the EH guard slot (operand 2 of \p Op) in the
	/// function's WinEHFuncInfo.
	///
	/// Aborts with a fatal error if the function has no WinEH info or if the
	/// operand is not a frame index. \returns the chain operand unchanged.
	static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
	  SDValue InChain = Op.getOperand(0);
	  SDValue GuardOp = Op.getOperand(2);

	  WinEHFuncInfo *FuncEHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (FuncEHInfo == nullptr)
	    report_fatal_error("EHGuard only live in functions using WinEH");

	  // The operand must be a frame index node; remember which slot it names.
	  if (auto *FrameIdx = dyn_cast<FrameIndexSDNode>(GuardOp)) {
	    FuncEHInfo->EHGuardFrameIndex = FrameIdx->getIndex();
	    // No DAG nodes are created; hand the incoming chain straight back.
	    return InChain;
	  }

	  report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
	}

	/// Emit a truncating store with saturation: signed when \p SignedSat is
	/// true, unsigned otherwise.
	///
	/// The node's operands are {Chain, Val, Ptr, undef} and its only result type
	/// is MVT::Other.
	static SDValue
	EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
	                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
	                SelectionDAG &DAG) {
	  SDVTList ChainOnlyVTs = DAG.getVTList(MVT::Other);
	  // The last operand is an undef of the pointer's type.
	  SDValue UndefOp = DAG.getUNDEF(Ptr.getValueType());
	  SDValue StoreOps[] = { Chain, Val, Ptr, UndefOp };

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<TruncSStoreSDNode>(ChainOnlyVTs, StoreOps,
	                                                     Dl, MemVT, MMO);
	  return DAG.getTargetMemSDNode<TruncUSStoreSDNode>(ChainOnlyVTs, StoreOps,
	                                                    Dl, MemVT, MMO);
	}

	/// Emit a masked truncating store with saturation: signed when \p SignedSat
	/// is true, unsigned otherwise.
	///
	/// The node's operands are {Chain, Ptr, Mask, Val} and its only result type
	/// is MVT::Other.
	static SDValue
	EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
	                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
	                      MachineMemOperand *MMO, SelectionDAG &DAG) {
	  SDVTList ChainOnlyVTs = DAG.getVTList(MVT::Other);
	  SDValue StoreOps[] = { Chain, Ptr, Mask, Val };

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(
	        ChainOnlyVTs, StoreOps, Dl, MemVT, MMO);
	  return DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(
	      ChainOnlyVTs, StoreOps, Dl, MemVT, MMO);
	}

	/// Lower an x86 intrinsic node that carries a chain.
	///
	/// Operand 0 of \p Op is the chain and operand 1 the intrinsic ID; the
	/// remaining operands are the intrinsic's own arguments. Intrinsics that have
	/// an entry in the with-chain intrinsic table are dispatched on the entry's
	/// type; a few others are handled individually.
	///
	/// \returns the lowered value, or an empty SDValue when nothing is done here
	/// (no table entry and no special case, or the flags read/write intrinsics
	/// which are expanded later).
	static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

	  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
	  if (!IntrData) {
	    // No table entry: handle the few intrinsics that need bespoke lowering.
	    switch (IntNo) {
	    case llvm::Intrinsic::x86_seh_ehregnode:
	      return MarkEHRegistrationNode(Op, DAG);
	    case llvm::Intrinsic::x86_seh_ehguard:
	      return MarkEHGuard(Op, DAG);
	    case llvm::Intrinsic::x86_flags_read_u32:
	    case llvm::Intrinsic::x86_flags_read_u64:
	    case llvm::Intrinsic::x86_flags_write_u32:
	    case llvm::Intrinsic::x86_flags_write_u64: {
	      // We need a frame pointer because this will get lowered to a PUSH/POP
	      // sequence.
	      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	      MFI.setHasCopyImplyingStackAdjustment(true);
	      // Don't do anything here, we will expand these intrinsics out later
	      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
	      return SDValue();
	    }
	    case Intrinsic::x86_lwpins32:
	    case Intrinsic::x86_lwpins64: {
	      SDLoc dl(Op);
	      SDValue Chain = Op->getOperand(0);
	      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	      SDValue LwpIns =
	          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
	                      Op->getOperand(3), Op->getOperand(4));
	      // The intrinsic's result is the COND_B (carry) condition of the node's
	      // first result.
	      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
	      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
	      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
	                         LwpIns.getValue(1));
	    }
	    }
	    return SDValue();
	  }

	  SDLoc dl(Op);
	  switch(IntrData->Type) {
	  default: llvm_unreachable("Unknown Intrinsic Type");
	  case RDSEED:
	  case RDRAND: {
	    // Emit the node with the right value type.
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
	    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
	    // Otherwise return the value from Rand, which is always 0, casted to i32.
	    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
	                      DAG.getConstant(1, dl, Op->getValueType(1)),
	                      DAG.getConstant(X86::COND_B, dl, MVT::i8),
	                      SDValue(Result.getNode(), 1) };
	    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

	    // Return { result, isValid, chain }.
	    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
	                       SDValue(Result.getNode(), 2));
	  }
	  case GATHER_AVX2: {
	    // Operands: chain, id, src, base, index, mask, scale.
	    SDValue Chain = Op.getOperand(0);
	    SDValue Src   = Op.getOperand(2);
	    SDValue Base  = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Mask  = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	                             Scale, Chain, Subtarget);
	  }
	  case GATHER: {
	    // Operands: chain, id, src, base, index, mask, scale.
	    SDValue Chain = Op.getOperand(0);
	    SDValue Src   = Op.getOperand(2);
	    SDValue Base  = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Mask  = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
	                         Chain, Subtarget);
	  }
	  case SCATTER: {
	    // Operands: chain, id, base, mask, index, src, scale.
	    SDValue Chain = Op.getOperand(0);
	    SDValue Base  = Op.getOperand(2);
	    SDValue Mask  = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Src   = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	                          Scale, Chain, Subtarget);
	  }
	  case PREFETCH: {
	    // Operands: chain, id, mask, index, base, scale, hint.
	    SDValue Hint = Op.getOperand(6);
	    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
	    assert((HintVal == 2 || HintVal == 3) &&
	           "Wrong prefetch hint in intrinsic: should be 2 or 3");
	    // Hint 2 selects the table's second opcode, hint 3 the first.
	    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
	    SDValue Chain = Op.getOperand(0);
	    SDValue Mask  = Op.getOperand(2);
	    SDValue Index = Op.getOperand(3);
	    SDValue Base  = Op.getOperand(4);
	    SDValue Scale = Op.getOperand(5);
	    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
	                           Subtarget);
	  }
	  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
	  case RDTSC: {
	    SmallVector<SDValue, 2> Results;
	    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
	                            Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // Read Performance Monitoring Counters.
	  case RDPMC: {
	    SmallVector<SDValue, 2> Results;
	    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // Get Extended Control Register.
	  case XGETBV: {
	    SmallVector<SDValue, 2> Results;
	    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // XTEST intrinsics.
	  case XTEST: {
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
	    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
	    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
	    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
	                       Ret, SDValue(InTrans.getNode(), 1));
	  }
	  // ADC/ADCX/SBB
	  case ADX: {
	    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
	    SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
	    // Adding -1 to the incoming carry operand regenerates the carry flag,
	    // which is then consumed by the arithmetic node below.
	    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
	                                DAG.getConstant(-1, dl, MVT::i8));
	    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
	                              Op.getOperand(4), GenCF.getValue(1));
	    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
	                                 Op.getOperand(5), MachinePointerInfo());
	    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
	    SDValue Results[] = { SetCC, Store };
	    return DAG.getMergeValues(Results, dl);
	  }
	  case COMPRESS_TO_MEM: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue DataToCompress = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);
	    MVT VT = DataToCompress.getSimpleValueType();

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    if (isAllOnesConstant(Mask)) // return just a store
	      return DAG.getStore(Chain, dl, DataToCompress, Addr,
	                          MemIntr->getMemOperand());

	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	    // Both flags must be passed explicitly: a non-truncating, compressing
	    // masked store.
	    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
	                              MemIntr->getMemOperand(),
	                              false /* truncating */, true /* compressing */);
	  }
	  case TRUNCATE_TO_MEM_VI8:
	  case TRUNCATE_TO_MEM_VI16:
	  case TRUNCATE_TO_MEM_VI32: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue DataToTruncate = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    EVT MemVT = MemIntr->getMemoryVT();

	    // The table's opcode says which flavour of truncation is requested.
	    uint16_t TruncationOp = IntrData->Opc0;
	    switch (TruncationOp) {
	    case X86ISD::VTRUNC: {
	      if (isAllOnesConstant(Mask)) // return just a truncate store
	        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
	                                 MemIntr->getMemOperand());

	      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
	                                MemIntr->getMemOperand(), true /* truncating */);
	    }
	    case X86ISD::VTRUNCUS:
	    case X86ISD::VTRUNCS: {
	      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
	      if (isAllOnesConstant(Mask))
	        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
	                               MemIntr->getMemOperand(), DAG);

	      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
	                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
	    }
	    default:
	      llvm_unreachable("Unsupported truncstore intrinsic");
	    }
	  }

	  case EXPAND_FROM_MEM: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue PassThru = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);
	    MVT VT = Op.getSimpleValueType();

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
	      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
	    if (X86::isZeroNode(Mask))
	      return DAG.getUNDEF(VT);

	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
	                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
	                             true /* expanding */);
	  }
	  }
	}

	/// Lower a return-address query. Operand 0 of \p Op is the constant frame
	/// depth; an empty SDValue is returned when the argument check reports a
	/// non-constant depth.
	SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  const unsigned FrameDepth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc Loc(Op);
	  EVT AddrVT = getPointerTy(DAG.getDataLayout());

	  if (FrameDepth == 0) {
	    // Current frame: load straight from the return address frame index.
	    SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
	    return DAG.getLoad(AddrVT, Loc, DAG.getEntryNode(), RetAddrSlot,
	                       MachinePointerInfo());
	  }

	  // Outer frame: the return address is loaded from one slot past the frame
	  // address computed for that depth.
	  SDValue OuterFrame = LowerFRAMEADDR(Op, DAG);
	  const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
	  SDValue SlotBytes = DAG.getConstant(RI->getSlotSize(), Loc, AddrVT);
	  SDValue RetAddrPtr =
	      DAG.getNode(ISD::ADD, Loc, AddrVT, OuterFrame, SlotBytes);
	  return DAG.getLoad(AddrVT, Loc, DAG.getEntryNode(), RetAddrPtr,
	                     MachinePointerInfo());
	}

	/// Lower an address-of-return-address query: flag the return address as
	/// taken and return the return address frame index.
	SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  FrameInfo.setReturnAddressIsTaken(true);

	  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
	  return RetAddrSlot;
	}

	/// Lower a frame-address query. Operand 0 of \p Op is the constant depth.
	///
	/// On targets using Windows CFI the depth is not consulted: a fixed frame
	/// object is created once (its index cached in the function info) and
	/// returned as a frame index. Otherwise the frame register is read and the
	/// saved frame pointer chain is followed `Depth` times.
	SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  EVT VT = Op.getValueType();

	  MFI.setFrameAddressIsTaken(true);

	  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
	    // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
	    // is not possible to crawl up the stack without looking at the unwind codes
	    // simultaneously.
	    int FrameAddrIndex = FuncInfo->getFAIndex();
	    if (!FrameAddrIndex) {
	      // Set up a frame object for the frame address.
	      unsigned SlotSize = RegInfo->getSlotSize();
	      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
	          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
	      FuncInfo->setFAIndex(FrameAddrIndex);
	    }
	    return DAG.getFrameIndex(FrameAddrIndex, VT);
	  }

	  unsigned FrameReg =
	      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	  SDLoc dl(Op);  // FIXME probably not meaningful
	  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
	          (FrameReg == X86::EBP && VT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
	  // Each load follows one link of the saved frame pointer chain.
	  while (Depth--)
	    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
	                            MachinePointerInfo());
	  return FrameAddr;
	}

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	/// Map a register name to its register number.
	///
	/// Only "esp", "rsp", "ebp" and "rbp" are recognised. Requesting "ebp" or
	/// "rbp" in a function without a frame pointer is a fatal error, as is any
	/// unrecognised name.
	unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                              SelectionDAG &DAG) const {
	  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	  const MachineFunction &MF = DAG.getMachineFunction();

	  // Unknown names map to 0 and are rejected at the end of the function.
	  unsigned Reg = StringSwitch<unsigned>(RegName)
	                       .Case("esp", X86::ESP)
	                       .Case("rsp", X86::RSP)
	                       .Case("ebp", X86::EBP)
	                       .Case("rbp", X86::RBP)
	                       .Default(0);

	  if (Reg == X86::EBP || Reg == X86::RBP) {
	    if (!TFI.hasFP(MF))
	      report_fatal_error("register " + StringRef(RegName) +
	                         " is allocatable: function has no frame pointer");
	#ifndef NDEBUG
	    else {
	      // Debug-only sanity check that the frame register is EBP or RBP.
	      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	      unsigned FrameReg =
	          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
	             "Invalid Frame Register!");
	    }
	#endif
	  }

	  if (Reg)
	    return Reg;

	  report_fatal_error("Invalid register name global variable");
	}

	/// Lower FRAME_TO_ARGS_OFFSET to a pointer-sized constant equal to twice the
	/// slot size.
	SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const unsigned SlotBytes = Subtarget.getRegisterInfo()->getSlotSize();
	  return DAG.getIntPtrConstant(2 * SlotBytes, SDLoc(Op));
	}

	/// Pick the register that holds the exception pointer for the given
	/// personality: RDX/EDX for the CoreCLR personality, RAX/EAX otherwise, with
	/// the 64-bit register used on 64-bit LP64 targets.
	unsigned X86TargetLowering::getExceptionPointerRegister(
	    const Constant *PersonalityFn) const {
	  const bool IsCoreCLR =
	      classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR;

	  if (Subtarget.isTarget64BitLP64())
	    return IsCoreCLR ? X86::RDX : X86::RAX;
	  return IsCoreCLR ? X86::EDX : X86::EAX;
	}

	/// Pick the register that holds the exception selector: RDX on 64-bit LP64
	/// targets, EDX otherwise. Must not be called for funclet personalities.
	unsigned X86TargetLowering::getExceptionSelectorRegister(
	    const Constant *PersonalityFn) const {
	  // Funclet personalities have no selector; the runtime does the selection.
	  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));

	  if (Subtarget.isTarget64BitLP64())
	    return X86::RDX;
	  return X86::EDX;
	}

	/// Fixed catch objects are needed exactly when the subtarget is Win64.
	bool X86TargetLowering::needsFixedCatchObjects() const {
	  const bool IsWin64 = Subtarget.isTargetWin64();
	  return IsWin64;
	}

	/// Lower an EH_RETURN node.
	///
	/// Operands of \p Op: 0 = chain, 1 = offset, 2 = handler. The handler is
	/// stored at (frame register + slot size + offset); that address is also
	/// copied into RCX/ECX and passed as the register operand of the resulting
	/// X86ISD::EH_RETURN node.
	SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain     = Op.getOperand(0);
	  SDValue Offset    = Op.getOperand(1);
	  SDValue Handler   = Op.getOperand(2);
	  SDLoc dl      (Op);

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
	  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
	          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
	  // The store address travels in RCX for 64-bit pointers, ECX otherwise.
	  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

	  // StoreAddr = Frame + SlotSize + Offset.
	  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
	                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
	                                                       dl));
	  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
	  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
	  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

	  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
	                     DAG.getRegister(StoreAddrReg, PtrVT));
	}

	/// Builds an X86ISD::EH_SJLJ_SETJMP node with result types (i32, Other) from
	/// the first two operands of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDLoc DL(Op);

	  // On non-64-bit subtargets the global base register may be needed once
	  // the isel pseudo is expanded, which happens after the CGBR pass has run.
	  // Request it here so that pass emits the defining code; otherwise a
	  // virtual register that was never defined could end up being referenced.
	  if (!Subtarget.is64Bit())
	    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(&DAG.getMachineFunction());

	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, ResultTys, Op.getOperand(0),
	                     Op.getOperand(1));
	}

	/// Builds an X86ISD::EH_SJLJ_LONGJMP node (result type Other) from the first
	/// two operands of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  SDValue First = Op.getOperand(0);
	  SDValue Second = Op.getOperand(1);
	  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, Loc, MVT::Other, First, Second);
	}

	/// Builds an X86ISD::EH_SJLJ_SETUP_DISPATCH node (result type Other) from the
	/// first operand of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  SDValue First = Op.getOperand(0);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, Loc, MVT::Other, First);
	}

	/// No adjustment is performed: the result is simply operand 0 of \p Op.
	static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
	  SDValue FirstOperand = Op.getOperand(0);
	  return FirstOperand;
	}

	/// Emits the stores that write trampoline machine code into memory.
	///
	/// Operands of \p Op: 0 = chain, 1 = trampoline address, 2 = nested function
	/// pointer, 3 = 'nest' parameter value, 4 = source value for the trampoline
	/// address, 5 = source value for the nested function (read on 32-bit only).
	///
	/// 64-bit layout (byte offsets from the trampoline address):
	///    0: REX.WB + (MOV64ri | r11)   -- movabsq $FPtr, %r11
	///    2: FPtr
	///   10: REX.WB + (MOV64ri | r10)   -- movabsq $Nest, %r10
	///   12: Nest
	///   20: REX.WB + JMP64r            -- jmpq *%r11
	///   22: ModRM byte selecting r11
	///
	/// 32-bit layout:
	///    0: MOV32ri | NestReg          -- movl $Nest, %NestReg
	///    1: Nest
	///    5: JMP (0xE9)
	///    6: Disp = FPtr - (Trmp + 10)
	///
	/// All stores hang off the incoming chain and are joined by a TokenFactor.
	SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDValue Root = Op.getOperand(0);
	  SDValue Trmp = Op.getOperand(1); // trampoline
	  SDValue FPtr = Op.getOperand(2); // nested function
	  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
	  SDLoc dl (Op);

	  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	  if (Subtarget.is64Bit()) {
	    SDValue OutChains[6];

	    // Large code-model.
	    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
	    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

	    // Only the low three encoding bits go into the opcode/ModRM byte.
	    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
	    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

	    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

	    // Load the pointer to the nested function into R11.
	    // The i16 store puts the REX prefix in the low byte and the opcode in
	    // the high byte.
	    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
	    SDValue Addr = Trmp;
	    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(2, dl, MVT::i64));
	    OutChains[1] =
	        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
	                     /* Alignment = */ 2);

	    // Load the 'nest' parameter value into R10.
	    // R10 is specified in X86CallingConv.td
	    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(10, dl, MVT::i64));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 10));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(12, dl, MVT::i64));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
	                     /* Alignment = */ 2);

	    // Jump to the nested function.
	    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(20, dl, MVT::i64));
	    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 20));

	    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(22, dl, MVT::i64));
	    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 22));

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  } else {
	    const Function *Func =
	        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
	    CallingConv::ID CC = Func->getCallingConv();
	    unsigned NestReg;

	    switch (CC) {
	    default:
	      llvm_unreachable("Unsupported calling convention");
	    case CallingConv::C:
	    case CallingConv::X86_StdCall: {
	      // Pass 'nest' parameter in ECX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::ECX;

	      // Check that ECX wasn't needed by an 'inreg' parameter.
	      FunctionType *FTy = Func->getFunctionType();
	      const AttributeList &Attrs = Func->getAttributes();

	      if (!Attrs.isEmpty() && !Func->isVarArg()) {
	        unsigned InRegCount = 0;
	        unsigned Idx = 1;

	        // Count the 32-bit registers consumed by 'inreg' parameters; Idx
	        // tracks the attribute index alongside the parameter iterator.
	        for (FunctionType::param_iterator I = FTy->param_begin(),
	             E = FTy->param_end(); I != E; ++I, ++Idx)
	          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
	            auto &DL = DAG.getDataLayout();
	            // FIXME: should only count parameters that are lowered to integers.
	            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
	          }

	        if (InRegCount > 2) {
	          report_fatal_error("Nest register in use - reduce number of inreg"
	                             " parameters!");
	        }
	      }
	      break;
	    }
	    case CallingConv::X86_FastCall:
	    case CallingConv::X86_ThisCall:
	    case CallingConv::Fast:
	      // Pass 'nest' parameter in EAX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::EAX;
	      break;
	    }

	    SDValue OutChains[4];
	    SDValue Addr, Disp;

	    // The jump displacement is relative to the end of the 10-byte trampoline.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(10, dl, MVT::i32));
	    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

	    // This is storing the opcode for MOV32ri.
	    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
	    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
	    OutChains[0] =
	        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
	                     Trmp, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(1, dl, MVT::i32));
	    OutChains[1] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
	                     /* Alignment = */ 1);

	    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(5, dl, MVT::i32));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 5),
	                                /* Alignment = */ 1);

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(6, dl, MVT::i32));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
	                     /* Alignment = */ 1);

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  }
	}

	/// Reads the FP control word through a 2-byte stack slot and converts its
	/// rounding-mode field into the FLT_ROUNDS encoding.
	SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  /*
	   The rounding mode is in bits 11:10 of FPSR, and has the following
	   settings:
	     00 Round to nearest
	     01 Round to -inf
	     10 Round to +inf
	     11 Round to 0

	  FLT_ROUNDS, on the other hand, expects the following:
	    -1 Undefined
	     0 Round to 0
	     1 Round to nearest
	     2 Round to +inf
	     3 Round to -inf

	  To perform the conversion, we do:
	    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
	  */

	  MachineFunction &MF = DAG.getMachineFunction();
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();

	  // Create a 2-byte stack object to receive the control word.
	  unsigned StackAlignment = Subtarget.getFrameLowering()->getStackAlignment();
	  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
	  SDValue StackSlot =
	      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

	  MachineMemOperand *MMO =
	      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	                              MachineMemOperand::MOStore, 2, 2);

	  // Store the control word into the slot (FNSTCW16m)...
	  SDValue StoreOps[] = { DAG.getEntryNode(), StackSlot };
	  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
	                                          DAG.getVTList(MVT::Other),
	                                          StoreOps, MVT::i16, MMO);

	  // ...and read it back as an i16.
	  SDValue CWD =
	      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

	  // Bit 11 moves down to bit 0.
	  SDValue Bit11 = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
	                              DAG.getConstant(0x800, DL, MVT::i16));
	  SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit11,
	                             DAG.getConstant(11, DL, MVT::i8));

	  // Bit 10 moves down to bit 1.
	  SDValue Bit10 = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
	                              DAG.getConstant(0x400, DL, MVT::i16));
	  SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit10,
	                             DAG.getConstant(9, DL, MVT::i8));

	  // ((CWD1 | CWD2) + 1) & 3
	  SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2);
	  SDValue PlusOne = DAG.getNode(ISD::ADD, DL, MVT::i16, Merged,
	                                DAG.getConstant(1, DL, MVT::i16));
	  SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, PlusOne,
	                               DAG.getConstant(3, DL, MVT::i16));

	  // Resize the i16 result to the requested type.
	  unsigned ResizeOpc =
	      VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND;
	  return DAG.getNode(ResizeOpc, DL, VT, RetVal);
	}

	// Split a unary integer vector op into two half-width ops of the same opcode
	// and concatenate their results back to the original type.
	static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  const unsigned Opcode = Op.getOpcode();
	  const unsigned HalfNumElts = VT.getVectorNumElements() / 2;
	  const unsigned HalfSizeInBits = VT.getSizeInBits() / 2;

	  // Pull out the low and high halves of the source vector.
	  SDValue Src = Op.getOperand(0);
	  SDValue LoSrc = extractSubVector(Src, 0, DAG, DL, HalfSizeInBits);
	  SDValue HiSrc = extractSubVector(Src, HalfNumElts, DAG, DL, HalfSizeInBits);

	  // Same element type, half as many elements.
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
	  SDValue LoRes = DAG.getNode(Opcode, DL, HalfVT, LoSrc);
	  SDValue HiRes = DAG.getNode(Opcode, DL, HalfVT, HiSrc);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoRes, HiRes);
	}

	// Split a 256-bit integer unary op into two 128-bit halves.
	static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  (void)VT; // Only read by the assert.
	  return LowerVectorIntUnary(Op, DAG);
	}

	// Split a 512-bit integer unary op into two 256-bit halves.
	static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Only handle AVX 512-bit vector integer operation");
	  (void)VT; // Only read by the assert.
	  return LowerVectorIntUnary(Op, DAG);
	}

	/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
	//
	// i8/i16 vectors are implemented using the dword LZCNT vector instruction
	// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
	// split the vector, perform the operation on its Lo and Hi parts and
	// concatenate the results.
	static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::CTLZ);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElems = VT.getVectorNumElements();

	  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
	         "Unsupported element type");

	  // Split the vector; its Lo and Hi parts will be handled in the next
	  // iteration.
	  if (16 < NumElems)
	    return LowerVectorIntUnary(Op, DAG);

	  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
	         "Unsupported value type for operation");

	  // Use native supported vector instruction vplzcntd.
	  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
	  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
	  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
	  // The zero-extension to i32 added (32 - element width) leading zeros to
	  // every lane; subtract them back out.
	  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

	  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
	}

	// Lower CTLZ using a PSHUFB lookup table implementation.
	//
	// The input is viewed as a vector of bytes. Each byte's leading-zero count is
	// computed from two per-nibble table lookups, and the per-byte counts are then
	// merged pairwise, doubling the element width each round, until the element
	// type of the original vector is reached.
	static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  int NumElts = VT.getVectorNumElements();
	  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
	  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

	  // Per-nibble leading zero PSHUFB lookup table: entry i is the number of
	  // leading zero bits in the 4-bit value i.
	  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
	                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
	                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

	  // Repeat the 16-entry table across the full byte-vector width.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumBytes; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

	  // Begin by bitcasting the input to byte vector, then split those bytes
	  // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
	  // If the hi input nibble is zero then we add both results together, otherwise
	  // we just take the hi result (by masking the lo result to zero before the
	  // add).
	  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
	  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

	  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
	  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
	  SDValue HiZ;
	  if (CurrVT.is512BitVector()) {
	    // 512-bit compares produce an i1 mask vector; sign-extend it back to a
	    // byte vector so it can be used as an AND mask.
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
	    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	  } else {
	    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
	  }

	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
	  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
	  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

	  // Merge result back from vXi8 back to VT, working on the lo/hi halves
	  // of the current vector width in the same way we did for the nibbles.
	  // If the upper half of the input element is zero then add the halves'
	  // leading zero counts together, otherwise just use the upper half's.
	  // Double the width of the result until we are at target width.
	  while (CurrVT != VT) {
	    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
	    int CurrNumElts = CurrVT.getVectorNumElements();
	    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
	    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
	    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

	    // Check if the upper half of the input element is zero.
	    if (CurrVT.is512BitVector()) {
	      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	    } else {
	      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	    }
	    HiZ = DAG.getBitcast(NextVT, HiZ);

	    // Move the upper/lower halves to the lower bits as we'll be extending to
	    // NextVT. Mask the lower result to zero if HiZ is true and add the results
	    // together.
	    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
	    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
	    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
	    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
	    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
	    CurrVT = NextVT;
	  }

	  return Res;
	}

	/// Picks a lowering strategy for a vector CTLZ:
	///   1. with CDI, use the native-instruction path;
	///   2. 256-bit vectors without Int256 are split into 128-bit halves;
	///   3. 512-bit vectors without BWI are split into 256-bit halves;
	///   4. otherwise use the PSHUFB lookup-table path (requires SSSE3).
	static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
	                               const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  if (Subtarget.hasCDI())
	    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

	  MVT VT = Op.getSimpleValueType();

	  const bool Split256 = VT.is256BitVector() && !Subtarget.hasInt256();
	  if (Split256)
	    return Lower256IntUnary(Op, DAG);

	  const bool Split512 = VT.is512BitVector() && !Subtarget.hasBWI();
	  if (Split512)
	    return Lower512IntUnary(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
	  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
	}

	/// Lowers a count-leading-zeros node.
	///
	/// Vectors are forwarded to LowerVectorCTLZ. Scalars are lowered to a BSR
	/// followed by an XOR with (width - 1); when the opcode is ISD::CTLZ a CMOV
	/// on the BSR flags first substitutes (2 * width - 1), so that a zero input
	/// yields the bit width after the XOR. i8 inputs are widened to i32 for the
	/// scan and truncated afterwards.
	static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  if (VT.isVector())
	    return LowerVectorCTLZ(Op, DL, Subtarget, DAG);

	  const unsigned BitWidth = VT.getSizeInBits();
	  const unsigned Opcode = Op.getOpcode();
	  const bool IsByte = (VT == MVT::i8);

	  // There is no i8 bsr, so scan an i32 zero-extension instead.
	  MVT ScanVT = VT;
	  SDValue Src = Op.getOperand(0);
	  if (IsByte) {
	    ScanVT = MVT::i32;
	    Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ScanVT, Src);
	  }

	  // bsr (scan bits in reverse); the second result carries EFLAGS.
	  SDVTList ScanTys = DAG.getVTList(ScanVT, MVT::i32);
	  SDValue Result = DAG.getNode(X86ISD::BSR, DL, ScanTys, Src);

	  if (Opcode == ISD::CTLZ) {
	    // If src is zero (i.e. bsr sets ZF), select 2*BitWidth-1 so that the
	    // xor below produces BitWidth.
	    SDValue CMovOps[] = {
	      Result,
	      DAG.getConstant(BitWidth + BitWidth - 1, DL, ScanVT),
	      DAG.getConstant(X86::COND_E, DL, MVT::i8),
	      Result.getValue(1)
	    };
	    Result = DAG.getNode(X86ISD::CMOV, DL, ScanVT, CMovOps);
	  }

	  // Turn the bit index into a leading-zero count: xor with BitWidth-1.
	  SDValue WidthMinusOne = DAG.getConstant(BitWidth - 1, DL, ScanVT);
	  Result = DAG.getNode(ISD::XOR, DL, ScanVT, Result, WidthMinusOne);

	  if (IsByte)
	    Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Result);
	  return Result;
	}

	/// Lowers a count-trailing-zeros node.
	///
	/// Vectors isolate the lowest set bit (x & -x) and then use either
	/// (width - 1) - ctlz(lsb) for CTTZ_ZERO_UNDEF or ctpop(lsb - 1) for CTTZ.
	/// Scalar CTTZ becomes a BSF plus a CMOV that yields the bit width when the
	/// source is zero.
	static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  SDLoc DL(Op);

	  if (!VT.isVector()) {
	    assert(Op.getOpcode() == ISD::CTTZ &&
	           "Only scalar CTTZ requires custom lowering");

	    // bsf (scan bits forward); the second result carries EFLAGS.
	    SDVTList ScanTys = DAG.getVTList(VT, MVT::i32);
	    SDValue Scan = DAG.getNode(X86ISD::BSF, DL, ScanTys, Op.getOperand(0));

	    // If src is zero (i.e. bsf sets ZF), return the bit width instead.
	    SDValue CMovOps[] = {
	      Scan,
	      DAG.getConstant(EltBits, DL, VT),
	      DAG.getConstant(X86::COND_E, DL, MVT::i8),
	      Scan.getValue(1)
	    };
	    return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOps);
	  }

	  SDValue Src = Op.getOperand(0);
	  SDValue Zero = DAG.getConstant(0, DL, VT);

	  // lsb(x) = (x & -x)
	  SDValue Negated = DAG.getNode(ISD::SUB, DL, VT, Zero, Src);
	  SDValue LSB = DAG.getNode(ISD::AND, DL, VT, Src, Negated);

	  // cttz_undef(x) = (width - 1) - ctlz(lsb)
	  if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
	    SDValue WidthMinusOne = DAG.getConstant(EltBits - 1, DL, VT);
	    SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, DL, VT, LSB);
	    return DAG.getNode(ISD::SUB, DL, VT, WidthMinusOne, LeadingZeros);
	  }

	  // cttz(x) = ctpop(lsb - 1)
	  SDValue One = DAG.getConstant(1, DL, VT);
	  SDValue BelowLSB = DAG.getNode(ISD::SUB, DL, VT, LSB, One);
	  return DAG.getNode(ISD::CTPOP, DL, VT, BelowLSB);
	}

	/// Split a 256-bit binary integer operation into two 128-bit operations with
	/// the same opcode and concatenate the two results.
	static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc DL(Op);
	  const unsigned Opcode = Op.getOpcode();
	  const unsigned HalfNumElts = VT.getVectorNumElements() / 2;

	  // Low/high 128-bit halves of the left operand.
	  SDValue LHS = Op.getOperand(0);
	  SDValue LHSLo = extract128BitVector(LHS, 0, DAG, DL);
	  SDValue LHSHi = extract128BitVector(LHS, HalfNumElts, DAG, DL);

	  // Low/high 128-bit halves of the right operand.
	  SDValue RHS = Op.getOperand(1);
	  SDValue RHSLo = extract128BitVector(RHS, 0, DAG, DL);
	  SDValue RHSHi = extract128BitVector(RHS, HalfNumElts, DAG, DL);

	  // Same element type, half as many elements.
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

	  SDValue LoRes = DAG.getNode(Opcode, DL, HalfVT, LHSLo, RHSLo);
	  SDValue HiRes = DAG.getNode(Opcode, DL, HalfVT, LHSHi, RHSHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoRes, HiRes);
	}

	/// Split a 512-bit binary integer operation into two 256-bit operations with
	/// the same opcode and concatenate the two results.
	static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc DL(Op);
	  const unsigned Opcode = Op.getOpcode();
	  const unsigned HalfNumElts = VT.getVectorNumElements() / 2;

	  // Low/high 256-bit halves of the left operand.
	  SDValue LHS = Op.getOperand(0);
	  SDValue LHSLo = extract256BitVector(LHS, 0, DAG, DL);
	  SDValue LHSHi = extract256BitVector(LHS, HalfNumElts, DAG, DL);

	  // Low/high 256-bit halves of the right operand.
	  SDValue RHS = Op.getOperand(1);
	  SDValue RHSLo = extract256BitVector(RHS, 0, DAG, DL);
	  SDValue RHSHi = extract256BitVector(RHS, HalfNumElts, DAG, DL);

	  // Same element type, half as many elements.
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

	  SDValue LoRes = DAG.getNode(Opcode, DL, HalfVT, LHSLo, RHSLo);
	  SDValue HiRes = DAG.getNode(Opcode, DL, HalfVT, LHSHi, RHSHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoRes, HiRes);
	}

	/// Lowers an add/sub node: i1-element types become an XOR of the two
	/// operands; anything else must be a 256-bit integer vector and is split
	/// into two 128-bit operations.
	static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (VT.getScalarType() != MVT::i1) {
	    assert(VT.is256BitVector() && VT.isInteger() &&
	           "Only handle AVX 256-bit vector integer operation");
	    return Lower256IntArith(Op, DAG);
	  }

	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  return DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
	}

	/// Lowers an integer ABS node.
	///
	/// Scalar i16/i32/i64 become a flag-producing negate (0 - x) followed by a
	/// CMOV with condition COND_GE over {x, 0 - x}. Any other type must be a
	/// 256-bit integer vector and is split into two 128-bit halves.
	static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
	    // Since X86 does not have CMOV for 8-bit integer, we don't convert
	    // 8-bit integer abs to NEG and CMOV.
	    SDLoc DL(Op);
	    SDValue N0 = Op.getOperand(0);
	    // Result 0 of the X86ISD::SUB is the negated value, result 1 the flags.
	    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
	                              DAG.getConstant(0, DL, VT), N0);
	    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
	                     SDValue(Neg.getNode(), 1)};
	    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
	  }

	  assert(Op.getSimpleValueType().is256BitVector() &&
	         Op.getSimpleValueType().isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return Lower256IntUnary(Op, DAG);
	}

	/// Lowers a 256-bit integer min/max node by splitting it into two 128-bit
	/// operations.
	static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  (void)VT; // Only read by the assert.
	  return Lower256IntArith(Op, DAG);
	}

	/// Custom lowering for vector integer MUL.
	///
	/// Strategy by type:
	///  - i1 elements: multiplication is AND.
	///  - 256-bit vectors without Int256: split into two 128-bit multiplies.
	///  - v16i8/v32i8/v64i8: sign-extend to i16 lanes, multiply, and narrow back
	///    (truncate with Int256, mask + PACKUS otherwise).
	///  - v4i32: PMADDWD when the upper 17 bits of both inputs are known zero,
	///    otherwise two PMULUDQ on even/odd lanes merged with a shuffle.
	///  - v2i64/v4i64/v8i64: PMULDQ when both inputs have more than 32 sign
	///    bits (SSE4.1); otherwise built from up to three PMULUDQ partial
	///    products, or left untouched when DQI is available and a high half is
	///    not known to be zero.
	static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
	                        SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  if (VT.getScalarType() == MVT::i1)
	    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntArith(Op, DAG);

	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
	  // vector pairs, multiply and truncate.
	  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
	    if (Subtarget.hasInt256()) {
	      // For 512-bit vectors, split into 256-bit vectors to allow the
	      // sign-extension to occur.
	      if (VT == MVT::v64i8)
	        return Lower512IntArith(Op, DAG);

	      // For 256-bit vectors, split into 128-bit vectors to allow the
	      // sign-extension to occur. We don't need this on AVX512BW as we can
	      // safely sign-extend to v32i16.
	      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
	        return Lower256IntArith(Op, DAG);

	      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
	      return DAG.getNode(
	          ISD::TRUNCATE, dl, VT,
	          DAG.getNode(ISD::MUL, dl, ExVT,
	                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
	                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
	    }

	    assert(VT == MVT::v16i8 &&
	           "Pre-AVX2 support only supports v16i8 multiplication");
	    MVT ExVT = MVT::v8i16;

	    // Extract the lo parts and sign extend to i16
	    SDValue ALo, BLo;
	    if (Subtarget.hasSSE41()) {
	      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
	      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
	    } else {
	      // Place each source byte in the high byte of an i16 lane, then
	      // arithmetic-shift it down by 8 to sign-extend.
	      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
	                              -1, 4, -1, 5, -1, 6, -1, 7};
	      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      ALo = DAG.getBitcast(ExVT, ALo);
	      BLo = DAG.getBitcast(ExVT, BLo);
	      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
	      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
	    }

	    // Extract the hi parts and sign extend to i16
	    SDValue AHi, BHi;
	    if (Subtarget.hasSSE41()) {
	      // Move the upper eight bytes down, then sign-extend in register.
	      const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
	                              -1, -1, -1, -1, -1, -1, -1, -1};
	      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
	      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
	    } else {
	      const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
	                              -1, 12, -1, 13, -1, 14, -1, 15};
	      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      AHi = DAG.getBitcast(ExVT, AHi);
	      BHi = DAG.getBitcast(ExVT, BHi);
	      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
	      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
	    }

	    // Multiply, mask the lower 8bits of the lo/hi results and pack
	    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
	    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
	    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	  }

	  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
	  if (VT == MVT::v4i32) {
	    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
	           "Should not custom lower when pmulld is available!");

	    // If the upper 17 bits of each element are zero then we can use PMADD.
	    APInt Mask17 = APInt::getHighBitsSet(32, 17);
	    if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
	      return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
	                         DAG.getBitcast(MVT::v8i16, A),
	                         DAG.getBitcast(MVT::v8i16, B));

	    // Extract the odd parts.
	    static const int UnpackMask[] = { 1, -1, 3, -1 };
	    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
	    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

	    // Multiply the even parts.
	    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
	    // Now multiply odd parts.
	    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);

	    Evens = DAG.getBitcast(VT, Evens);
	    Odds = DAG.getBitcast(VT, Odds);

	    // Merge the two vectors back together with a shuffle. This expands into 2
	    // shuffles.
	    static const int ShufMask[] = { 0, 4, 2, 6 };
	    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
	  }

	  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
	         "Only know how to lower V2I64/V4I64/V8I64 multiply");

	  // 32-bit vector types used for MULDQ/MULUDQ.
	  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

	  // MULDQ returns the 64-bit result of the signed multiplication of the lower
	  // 32-bits. We can lower with this if the sign bits stretch that far.
	  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
	      DAG.ComputeNumSignBits(B) > 32) {
	    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
	                       DAG.getBitcast(MulVT, B));
	  }

	  //  Ahi = psrlqi(a, 32);
	  //  Bhi = psrlqi(b, 32);
	  //
	  //  AloBlo = pmuludq(a, b);
	  //  AloBhi = pmuludq(a, Bhi);
	  //  AhiBlo = pmuludq(Ahi, b);
	  //
	  //  Hi = psllqi(AloBhi + AhiBlo, 32);
	  //  return AloBlo + Hi;
	  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
	  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
	  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

	  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
	  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
	  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

	  // If DQI is supported we can use MULLQ, but MULUDQ is still better if
	  // the high bits are known to be zero.
	  if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
	    return Op;

	  // Bit cast to 32-bit vectors for MULUDQ.
	  SDValue Alo = DAG.getBitcast(MulVT, A);
	  SDValue Blo = DAG.getBitcast(MulVT, B);

	  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

	  // Only multiply lo/hi halves that aren't known to be zero.
	  SDValue AloBlo = Zero;
	  if (!ALoIsZero && !BLoIsZero)
	    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

	  SDValue AloBhi = Zero;
	  if (!ALoIsZero && !BHiIsZero) {
	    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
	    Bhi = DAG.getBitcast(MulVT, Bhi);
	    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
	  }

	  SDValue AhiBlo = Zero;
	  if (!AHiIsZero && !BLoIsZero) {
	    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
	    Ahi = DAG.getBitcast(MulVT, Ahi);
	    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
	  }

	  // Sum the cross products, shift them into the upper 32 bits, and add the
	  // low product.
	  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
	  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

	  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
	}

	/// Custom lowering for vector MULHU/MULHS (high half of a multiply).
	///
	/// 256-bit vectors without Int256 are split into two 128-bit ops. After that
	/// only i8 vectors are expected: the inputs are zero- or sign-extended to i16
	/// lanes (zero for MULHU, sign otherwise), multiplied, shifted right by 8 to
	/// bring the upper byte of each product down, and narrowed back to i8 with a
	/// truncate or PACKUS. v64i8 is split into two 256-bit ops first.
	static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntArith(Op, DAG);

	  // Only i8 vectors should need custom lowering after this.
	  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
	          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
	         "Unsupported vector type");

	  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
	  // logical shift down the upper half and pack back to i8.
	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
	  // and then ashr/lshr the upper bits down to the lower bits before multiply.
	  unsigned Opcode = Op.getOpcode();
	  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
	  unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);

	  // For 512-bit vectors, split into 256-bit vectors to allow the
	  // sign-extension to occur.
	  if (VT == MVT::v64i8)
	    return Lower512IntArith(Op, DAG);

	  // AVX2 implementations - extend xmm subvectors to ymm.
	  if (Subtarget.hasInt256()) {
	    unsigned NumElems = VT.getVectorNumElements();
	    // Lo/Hi start out as subvector indices (used by the v16i8 path below)
	    // and are reused as result values in the v32i8 path.
	    SDValue Lo = DAG.getIntPtrConstant(0, dl);
	    SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);

	    if (VT == MVT::v32i8) {
	      if (Subtarget.hasBWI()) {
	        SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
	        SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
	        SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
	        Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
	                          DAG.getConstant(8, dl, MVT::v32i16));
	        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	      }
	      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
	      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
	      SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
	      SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
	      ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
	      BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
	      AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
	      BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
	      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
	                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
	                       DAG.getConstant(8, dl, MVT::v16i16));
	      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
	                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
	                       DAG.getConstant(8, dl, MVT::v16i16));
	      // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
	      // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
	      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
	                            16, 17, 18, 19, 20, 21, 22, 23};
	      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
	                            24, 25, 26, 27, 28, 29, 30, 31};
	      return DAG.getNode(X86ISD::PACKUS, dl, VT,
	                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
	                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
	    }

	    assert(VT == MVT::v16i8 && "Unexpected VT");

	    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
	    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
	    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
	    Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
	                      DAG.getConstant(8, dl, MVT::v16i16));
	    // If we have BWI we can use truncate instruction.
	    if (Subtarget.hasBWI())
	      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
	    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
	    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	  }

	  assert(VT == MVT::v16i8 &&
	         "Pre-AVX2 support only supports v16i8 multiplication");
	  MVT ExVT = MVT::v8i16;
	  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

	  // Extract the lo parts and zero/sign extend to i16.
	  SDValue ALo, BLo;
	  if (Subtarget.hasSSE41()) {
	    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
	    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
	  } else {
	    // Place each source byte in the high byte of an i16 lane, then shift it
	    // down by 8 (logical for MULHU, arithmetic otherwise).
	    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
	                            -1, 4, -1, 5, -1, 6, -1, 7};
	    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    ALo = DAG.getBitcast(ExVT, ALo);
	    BLo = DAG.getBitcast(ExVT, BLo);
	    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
	    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
	  }

	  // Extract the hi parts and zero/sign extend to i16.
	  SDValue AHi, BHi;
	  if (Subtarget.hasSSE41()) {
	    const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
	                            -1, -1, -1, -1, -1, -1, -1, -1};
	    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
	    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
	  } else {
	    const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
	                            -1, 12, -1, 13, -1, 14, -1, 15};
	    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    AHi = DAG.getBitcast(ExVT, AHi);
	    BHi = DAG.getBitcast(ExVT, BHi);
	    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
	    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
	  }

	  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
	  // pack back to v16i8.
	  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
	  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
	  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}

	/// Lower a 128-bit integer divide/remainder node on Win64 to a call to the
	/// matching runtime library routine.
	///
	/// Every operand is spilled to its own 16-byte aligned stack temporary and the
	/// address of that slot is what is handed to the callee. The call is described
	/// as returning v2i64; the returned value is bitcast back to the node's own
	/// 128-bit integer type.
	///
	/// \param Op  An SDIV/UDIV/SREM/UREM/SDIVREM/UDIVREM node with a 128-bit
	///            integer result. Any other opcode is unreachable.
	/// \param DAG The selection DAG the replacement nodes are created in.
	/// \returns The call result bitcast to the value type of \p Op.
	SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.isTargetWin64() && "Unexpected target");
	  EVT VT = Op.getValueType();
	  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
	         "Unexpected return type for lowering");

	  // Select the library routine and remember its signedness for the result
	  // extension flags set on the call below.
	  RTLIB::Libcall LC;
	  bool isSigned;
	  switch (Op->getOpcode()) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
	  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
	  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
	  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
	  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
	  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
	  }

	  SDLoc dl(Op);
	  SDValue InChain = DAG.getEntryNode();

	  // Store each 128-bit operand to a stack slot and pass a pointer to it. The
	  // stores are chained one after another so the call depends on all of them.
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = Op->getOperand(i).getValueType();
	    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
	           "Unexpected argument type for lowering");
	    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
	    Entry.Node = StackPtr;
	    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
	                           MachinePointerInfo(), /* Alignment = */ 16);
	    // getTypeForEVT() takes the LLVMContext by reference and yields a Type
	    // pointer, matching the use further down in this function.
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Ty = PointerType::get(ArgTy,0);
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Args.push_back(Entry);
	  }

	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(
	          getLibcallCallingConv(LC),
	          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
	          std::move(Args))
	      .setInRegister()
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  // Only the returned value is used; reinterpret the v2i64 as the i128 type.
	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
	  return DAG.getBitcast(VT, CallInfo.first);
	}

	// Lower a vector [SU]MUL_LOHI node. 256-bit vectors on subtargets without
	// Int256 are split into two 128-bit halves and re-emitted with the same
	// opcode. Otherwise the even and odd i32 elements are multiplied separately
	// with PMULUDQ/PMULDQ and the low/high 32-bit halves of the 64-bit products
	// are shuffled back into element order. The result is a merge of
	// {low parts, high parts}.
	static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
	MVT VT = Op0.getSimpleValueType();
	SDLoc dl(Op);

	// Decompose 256-bit ops into smaller 128-bit ops.
	if (VT.is256BitVector() && !Subtarget.hasInt256()) {
	unsigned Opcode = Op.getOpcode();
	unsigned NumElems = VT.getVectorNumElements();
	MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
	SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
	SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
	SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
	SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
	SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
	SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
	// Result 0 of each half is concatenated into result 0 of the whole node,
	// and likewise for result 1.
	SDValue Ops[] = {
	DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
	DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
	};
	return DAG.getMergeValues(Ops, dl);
	}

	assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) \|\|
	(VT == MVT::v8i32 && Subtarget.hasInt256()) \|\|
	(VT == MVT::v16i32 && Subtarget.hasAVX512()));

	int NumElts = VT.getVectorNumElements();

	// PMULxD operations multiply each even value (starting at 0) of LHS with
	// the related value of RHS and produce a widened result.
	// E.g., PMULUDQ <4 x i32> <a\|b\|c\|d>, <4 x i32> <e\|f\|g\|h>
	// => <2 x i64> <ae\|cg>
	//
	// In other words, to have all the results, we need to perform two PMULxD:
	// 1. one with the even values.
	// 2. one with the odd values.
	// To achieve #2, we need to place the odd values at an even position.
	//
	// Place the odd value at an even position (basically, shift all values 1
	// step to the left):
	// The mask is sized for the widest supported type (16 elements); only the
	// first NumElts entries are used below.
	const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
	// <a\|b\|c\|d> => <b\|undef\|d\|undef>
	SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
	makeArrayRef(&Mask[0], NumElts));
	// <e\|f\|g\|h> => <f\|undef\|h\|undef>
	SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
	makeArrayRef(&Mask[0], NumElts));

	// Emit two multiplies, one for the even-indexed ints and one for the
	// odd-indexed ints.
	MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
	bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
	// The signed multiply (PMULDQ) is only selected when SSE4.1 is available;
	// otherwise the unsigned multiply is used and fixed up below.
	unsigned Opcode =
	(!IsSigned \|\| !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
	// PMULUDQ <4 x i32> <a\|b\|c\|d>, <4 x i32> <e\|f\|g\|h>
	// => <2 x i64> <ae\|cg>
	SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
	// PMULUDQ <4 x i32> <b\|undef\|d\|undef>, <4 x i32> <f\|undef\|h\|undef>
	// => <2 x i64> <bf\|dh>
	SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

	// Shuffle it back into the right order.
	// For output element i, pick 32-bit lane (i / 2) * 2 (low half) or that
	// lane + 1 (high half) from Mul1 when i is even and from Mul2 (offset by
	// NumElts) when i is odd.
	SmallVector<int, 16> HighMask(NumElts);
	SmallVector<int, 16> LowMask(NumElts);
	for (int i = 0; i != NumElts; ++i) {
	HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
	LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
	}

	SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
	SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);

	// If we have a signed multiply but no PMULDQ, fix up the high parts of an
	// unsigned multiply:
	//   Highs -= ((Op0 >>s 31) & Op1) + ((Op1 >>s 31) & Op0)
	if (IsSigned && !Subtarget.hasSSE41()) {
	SDValue ShAmt = DAG.getConstant(
	31, dl,
	DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
	SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
	DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
	SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
	DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

	SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
	Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
	}

	// The first result of MUL_LOHI is actually the low value, followed by the
	// high value.
	SDValue Ops[] = {Lows, Highs};
	return DAG.getMergeValues(Ops, dl);
	}

	// Report whether the subtarget natively provides the shift-by-immediate form
	// selected by Opcode for the vector type VT.
	static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
	                                        unsigned Opcode) {
	  const unsigned EltBits = VT.getScalarSizeInBits();

	  // Elements narrower than 16 bits never qualify.
	  if (EltBits < 16)
	    return false;

	  // 512-bit vectors on AVX-512: elements wider than 16 bits always qualify,
	  // 16-bit elements additionally need BWI.
	  if (VT.is512BitVector() && Subtarget.hasAVX512()) {
	    if (EltBits > 16)
	      return true;
	    if (Subtarget.hasBWI())
	      return true;
	  }

	  // 128-bit vectors need SSE2, 256-bit vectors need Int256.
	  bool HasShift = VT.is128BitVector() && Subtarget.hasSSE2();
	  if (!HasShift)
	    HasShift = VT.is256BitVector() && Subtarget.hasInt256();

	  // Everything except an arithmetic right shift is decided at this point.
	  if (Opcode != ISD::SRA)
	    return HasShift;

	  // Arithmetic right shift: v2i64/v4i64 additionally require AVX-512.
	  if (!HasShift)
	    return false;
	  if (Subtarget.hasAVX512())
	    return true;
	  return VT != MVT::v2i64 && VT != MVT::v4i64;
	}

	// Shift by a run-time amount that is identical in every vector lane. These
	// instructions are defined together with the shift-by-immediate forms, so the
	// answer is simply the one given for the immediate form.
	static
	bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
	                                      unsigned Opcode) {
	  const bool HasImmForm = SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
	  return HasImmForm;
	}

	// Report whether the subtarget natively provides the per-element variable
	// shift selected by Opcode for the vector type VT.
	static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
	                                    unsigned Opcode) {
	  // Nothing is available without Int256.
	  if (!Subtarget.hasInt256())
	    return false;

	  // Elements narrower than 16 bits never qualify.
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits < 16)
	    return false;

	  // vXi16 supported only on AVX-512, BWI
	  if (EltBits == 16 && !Subtarget.hasBWI())
	    return false;

	  // With AVX-512 every remaining combination is accepted.
	  if (Subtarget.hasAVX512())
	    return true;

	  // Otherwise only 128-bit and 256-bit vectors qualify ...
	  bool HasShift = VT.is128BitVector();
	  if (!HasShift)
	    HasShift = VT.is256BitVector();
	  if (Opcode != ISD::SRA)
	    return HasShift;

	  // ... and an arithmetic right shift is not available for v2i64/v4i64.
	  return HasShift && VT != MVT::v2i64 && VT != MVT::v4i64;
	}

	// Try to lower a vector SHL/SRL/SRA whose shift amount is the same constant
	// in every lane. Two shapes of amount are recognised: a BUILD_VECTOR with a
	// constant splat, and (for 64-bit element types) a BITCAST of a BUILD_VECTOR
	// in which each 64-bit amount has been split into narrower constants.
	// Returns an empty SDValue when none of the patterns apply.
	// NOTE(review): ShiftAmt is not range-checked before it is used in the C++
	// shift expressions below (e.g. `-1U << ShiftAmt`) - presumably callers never
	// reach here with an out-of-range constant amount; confirm.
	static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);

	// Map the generic shift opcode to the X86 shift-by-immediate node.
	unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

	// Emulate a 64-bit arithmetic right shift of R (v2i64 or v4i64) by the
	// constant ShiftAmt using operations on the vector viewed as i32 elements.
	auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
	assert((VT == MVT::v2i64 \|\| VT == MVT::v4i64) && "Unexpected SRA type");
	MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
	SDValue Ex = DAG.getBitcast(ExVT, R);

	// ashr(R, 63) === cmp_slt(R, 0)
	if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
	assert((VT != MVT::v4i64 \|\| Subtarget.hasInt256()) &&
	"Unsupported PCMPGT op");
	return DAG.getNode(X86ISD::PCMPGT, dl, VT,
	getZeroVector(VT, Subtarget, DAG, dl), R);
	}

	if (ShiftAmt >= 32) {
	// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
	SDValue Upper =
	getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
	SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	ShiftAmt - 32, DAG);
	// Each result i64 takes its low i32 from the odd lane of Lower and its
	// high i32 from the odd lane of Upper.
	if (VT == MVT::v2i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
	if (VT == MVT::v4i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	{9, 1, 11, 3, 13, 5, 15, 7});
	} else {
	// SRA upper i32, SRL whole i64 and select lower i32.
	SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	ShiftAmt, DAG);
	SDValue Lower =
	getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
	Lower = DAG.getBitcast(ExVT, Lower);
	// Each result i64 takes its low i32 from the even lane of Lower and its
	// high i32 from the odd lane of Upper.
	if (VT == MVT::v2i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
	if (VT == MVT::v4i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	{8, 1, 10, 3, 12, 5, 14, 7});
	}
	return DAG.getBitcast(VT, Ex);
	};

	// Optimize shl/srl/sra with constant shift amount.
	if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
	if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
	uint64_t ShiftAmt = ShiftConst->getZExtValue();

	// Natively supported: emit the shift-by-immediate node directly.
	if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	// i64 SRA needs to be performed as partial shifts.
	if (((!Subtarget.hasXOP() && VT == MVT::v2i64) \|\|
	(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
	Op.getOpcode() == ISD::SRA)
	return ArithmeticShiftRight64(ShiftAmt);

	// Byte-element vectors: shift as i16 elements and mask off the bits that
	// crossed a byte boundary.
	if (VT == MVT::v16i8 \|\|
	(Subtarget.hasInt256() && VT == MVT::v32i8) \|\|
	VT == MVT::v64i8) {
	unsigned NumElts = VT.getVectorNumElements();
	MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	// Simple i8 add case
	if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
	return DAG.getNode(ISD::ADD, dl, VT, R, R);

	// ashr(R, 7) === cmp_slt(R, 0)
	if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
	SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
	if (VT.is512BitVector()) {
	// 512-bit compare produces a v64i1 mask, which is sign-extended back
	// to the byte vector type.
	assert(VT == MVT::v64i8 && "Unexpected element type!");
	SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
	return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
	}
	return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
	}

	// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
	if (VT == MVT::v16i8 && Subtarget.hasXOP())
	return SDValue();

	if (Op.getOpcode() == ISD::SHL) {
	// Make a large shift.
	SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
	R, ShiftAmt, DAG);
	SHL = DAG.getBitcast(VT, SHL);
	// Zero out the rightmost bits.
	return DAG.getNode(ISD::AND, dl, VT, SHL,
	DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
	}
	if (Op.getOpcode() == ISD::SRL) {
	// Make a large shift.
	SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
	R, ShiftAmt, DAG);
	SRL = DAG.getBitcast(VT, SRL);
	// Zero out the leftmost bits.
	return DAG.getNode(ISD::AND, dl, VT, SRL,
	DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
	}
	if (Op.getOpcode() == ISD::SRA) {
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
	SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

	// Mask is the position the byte's sign bit (128) lands in after the
	// logical shift.
	SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
	Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
	Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
	return Res;
	}
	llvm_unreachable("Unknown shift opcode.");
	}
	}
	}

	// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
	// TODO: Replace constant extraction with getTargetConstantBitsFromNode.
	if (!Subtarget.hasXOP() &&
	(VT == MVT::v2i64 \|\| (Subtarget.hasInt256() && VT == MVT::v4i64) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v8i64))) {

	// AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
	unsigned SubVectorScale = 1;
	if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	SubVectorScale =
	Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
	Amt = Amt.getOperand(0);
	}

	// Peek through any splat that was introduced for i64 shift vectorization.
	int SplatIndex = -1;
	if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
	if (SVN->isSplat()) {
	SplatIndex = SVN->getSplatIndex();
	Amt = Amt.getOperand(0);
	assert(SplatIndex < (int)VT.getVectorNumElements() &&
	"Splat shuffle referencing second operand");
	}

	// Only a BITCAST of a BUILD_VECTOR is handled from here on.
	if (Amt.getOpcode() != ISD::BITCAST \|\|
	Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	Amt = Amt.getOperand(0);
	// Ratio is the number of BUILD_VECTOR operands that make up one 64-bit
	// shift amount.
	unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
	(SubVectorScale * VT.getVectorNumElements());
	unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
	uint64_t ShiftAmt = 0;
	// Start at the splatted element if there was a splat, else at element 0.
	unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
	// Reassemble the 64-bit amount: piece i is placed at bit offset
	// i * (64 / Ratio).
	for (unsigned i = 0; i != Ratio; ++i) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
	if (!C)
	return SDValue();
	// 6 == Log2(64)
	ShiftAmt \|= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
	}

	// Check remaining shift amounts (if not a splat).
	if (SplatIndex < 0) {
	for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
	uint64_t ShAmt = 0;
	for (unsigned j = 0; j != Ratio; ++j) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
	if (!C)
	return SDValue();
	// 6 == Log2(64)
	ShAmt \|= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
	}
	// Every lane must use the same amount as the first one.
	if (ShAmt != ShiftAmt)
	return SDValue();
	}
	}

	if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	if (Op.getOpcode() == ISD::SRA)
	return ArithmeticShiftRight64(ShiftAmt);
	}

	return SDValue();
	}

	// Try to lower a vector shift in which every lane is shifted by the same
	// (not necessarily constant) amount. The uniform amount is looked for in a
	// splat BUILD_VECTOR or in a splat shuffle; a v2i64 amount given as a BITCAST
	// of a BUILD_VECTOR whose operand groups all match is also accepted.
	// Returns an empty SDValue when no pattern applies.
	static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  const unsigned ShiftOpc = Op.getOpcode();
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);

	  // X86 nodes for the two amount flavours handled below. Anything that is
	  // neither SHL nor SRL is treated as an arithmetic right shift.
	  unsigned ScalarAmtOpc;
	  unsigned VectorAmtOpc;
	  switch (ShiftOpc) {
	  case ISD::SHL:
	    ScalarAmtOpc = X86ISD::VSHLI;
	    VectorAmtOpc = X86ISD::VSHL;
	    break;
	  case ISD::SRL:
	    ScalarAmtOpc = X86ISD::VSRLI;
	    VectorAmtOpc = X86ISD::VSRL;
	    break;
	  default:
	    ScalarAmtOpc = X86ISD::VSRAI;
	    VectorAmtOpc = X86ISD::VSRA;
	    break;
	  }

	  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, ShiftOpc)) {
	    MVT EltVT = VT.getVectorElementType();
	    SDValue SplatAmt;

	    if (auto *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
	      // A splatting build_vector supplies the amount directly, unless the
	      // splat value is undef.
	      SplatAmt = BV->getSplatValue();
	      if (SplatAmt && SplatAmt.isUndef())
	        SplatAmt = SDValue();
	    } else {
	      // Look through a subvector extract. Note that this updates Amt for the
	      // remainder of the function as well.
	      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
	        Amt = Amt.getOperand(0);

	      auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Amt);
	      if (Shuf && Shuf->isSplat()) {
	        unsigned SplatIdx = (unsigned)Shuf->getSplatIndex();
	        SDValue InVec = Amt.getOperand(0);
	        const unsigned InOpc = InVec.getOpcode();

	        if (InOpc == ISD::BUILD_VECTOR) {
	          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
	                 "Unexpected shuffle index found!");
	          SplatAmt = InVec.getOperand(SplatIdx);
	        } else if (InOpc == ISD::INSERT_VECTOR_ELT) {
	          // Use the inserted scalar when it was inserted at the splat index.
	          auto *InsIdx = dyn_cast<ConstantSDNode>(InVec.getOperand(2));
	          if (InsIdx && InsIdx->getZExtValue() == SplatIdx)
	            SplatAmt = InVec.getOperand(1);
	        }

	        // Fall back to extracting the element from the shuffle's input; this
	        // avoids introducing an extract element from the shuffle itself.
	        if (!SplatAmt)
	          SplatAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
	                                 DAG.getIntPtrConstant(SplatIdx, dl));
	      }
	    }

	    if (SplatAmt.getNode()) {
	      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
	      // Zero-extend the amount to i64 or i32 depending on the element width.
	      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
	        SplatAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, SplatAmt);
	      else if (EltVT.bitsLT(MVT::i32))
	        SplatAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SplatAmt);

	      return getTargetVShiftNode(ScalarAmtOpc, dl, VT, Src, SplatAmt,
	                                 Subtarget, DAG);
	    }
	  }

	  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
	  const bool IsSplitAmt = VT == MVT::v2i64 &&
	                          Amt.getOpcode() == ISD::BITCAST &&
	                          Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR;
	  if (!IsSplitAmt)
	    return SDValue();

	  Amt = Amt.getOperand(0);
	  const unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
	                         VT.getVectorNumElements();

	  // Every later group of Ratio operands has to repeat the first group.
	  for (unsigned Base = Ratio; Base != Amt.getNumOperands(); Base += Ratio)
	    for (unsigned Piece = 0; Piece != Ratio; ++Piece)
	      if (Amt.getOperand(Piece) != Amt.getOperand(Base + Piece))
	        return SDValue();

	  if (!SupportedVectorShiftWithBaseAmnt(VT, Subtarget, ShiftOpc))
	    return SDValue();

	  // Shift by the original, unmodified amount operand.
	  return DAG.getNode(VectorAmtOpc, dl, VT, Src, Op.getOperand(1));
	}

	static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	assert(VT.isVector() && "Custom lowering only for vector shifts!");
	assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

	if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
	return V;

	if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
	return V;

	if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
	return Op;

	// XOP has 128-bit variable logical/arithmetic shifts.
	// +ve/-ve Amt = shift left/right.
	if (Subtarget.hasXOP() &&
	(VT == MVT::v2i64 \|\| VT == MVT::v4i32 \|\|
	VT == MVT::v8i16 \|\| VT == MVT::v16i8)) {
	if (Op.getOpcode() == ISD::SRL \|\| Op.getOpcode() == ISD::SRA) {
	SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
	Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
	}
	if (Op.getOpcode() == ISD::SHL \|\| Op.getOpcode() == ISD::SRL)
	return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
	if (Op.getOpcode() == ISD::SRA)
	return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
	}

	// 2i64 vector logical shifts can efficiently avoid scalarization - do the
	// shifts per-lane and then shuffle the partial results back together.
	if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
	// Splat the shift amounts so the scalar shifts above will catch it.
	SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
	SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
	SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
	return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
	}

	// i64 vector arithmetic shift can be emulated with the transform:
	// M = lshr(SIGN_MASK, Amt)
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
	if ((VT == MVT::v2i64 \|\| (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
	Op.getOpcode() == ISD::SRA) {
	SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
	SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
	R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
	R = DAG.getNode(ISD::XOR, dl, VT, R, M);
	R = DAG.getNode(ISD::SUB, dl, VT, R, M);
	return R;
	}

	// If possible, lower this packed shift into a vector multiply instead of
	// expanding it into a sequence of scalar shifts.
	// Do this only if the vector shift count is a constant build_vector.
	if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
	(VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\|
	(Subtarget.hasInt256() && VT == MVT::v16i16))) {
	SmallVector<SDValue, 8> Elts;
	MVT SVT = VT.getVectorElementType();
	unsigned SVTBits = SVT.getSizeInBits();
	APInt One(SVTBits, 1);
	unsigned NumElems = VT.getVectorNumElements();

	for (unsigned i=0; i !=NumElems; ++i) {
	SDValue Op = Amt->getOperand(i);
	if (Op->isUndef()) {
	Elts.push_back(Op);
	continue;
	}

	ConstantSDNode *ND = cast<ConstantSDNode>(Op);
	APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
	uint64_t ShAmt = C.getZExtValue();
	if (ShAmt >= SVTBits) {
	Elts.push_back(DAG.getUNDEF(SVT));
	continue;
	}
	Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
	}
	SDValue BV = DAG.getBuildVector(VT, dl, Elts);
	return DAG.getNode(ISD::MUL, dl, VT, R, BV);
	}

	// Lower SHL with variable shift amount.
	if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
	Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

	Op = DAG.getNode(ISD::ADD, dl, VT, Op,
	DAG.getConstant(0x3f800000U, dl, VT));
	Op = DAG.getBitcast(MVT::v4f32, Op);
	Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
	return DAG.getNode(ISD::MUL, dl, VT, Op, R);
	}

	// If possible, lower this shift as a sequence of two shifts by
	// constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
	// Example:
	// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
	//
	// Could be rewritten as:
	// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
	//
	// The advantage is that the two shifts from the example would be
	// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
	// the vector shift into four scalar shifts plus four pairs of vector
	// insert/extract.
	if (ConstantAmt && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) {
	bool UseMOVSD = false;
	bool CanBeSimplified;
	// The splat value for the first packed shift (the 'X' from the example).
	SDValue Amt1 = Amt->getOperand(0);
	// The splat value for the second packed shift (the 'Y' from the example).
	SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);

	// See if it is possible to replace this node with a sequence of
	// two shifts followed by a MOVSS/MOVSD/PBLEND.
	if (VT == MVT::v4i32) {
	// Check if it is legal to use a MOVSS.
	CanBeSimplified = Amt2 == Amt->getOperand(2) &&
	Amt2 == Amt->getOperand(3);
	if (!CanBeSimplified) {
	// Otherwise, check if we can still simplify this node using a MOVSD.
	CanBeSimplified = Amt1 == Amt->getOperand(1) &&
	Amt->getOperand(2) == Amt->getOperand(3);
	UseMOVSD = true;
	Amt2 = Amt->getOperand(2);
	}
	} else {
	// Do similar checks for the case where the machine value type
	// is MVT::v8i16.
	CanBeSimplified = Amt1 == Amt->getOperand(1);
	for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
	CanBeSimplified = Amt2 == Amt->getOperand(i);

	if (!CanBeSimplified) {
	UseMOVSD = true;
	CanBeSimplified = true;
	Amt2 = Amt->getOperand(4);
	for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
	CanBeSimplified = Amt1 == Amt->getOperand(i);
	for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
	CanBeSimplified = Amt2 == Amt->getOperand(j);
	}
	}

	if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
	isa<ConstantSDNode>(Amt2)) {
	// Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
	SDValue Splat1 =
	DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
	SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
	SDValue Splat2 =
	DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
	SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
	SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
	SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
	if (UseMOVSD)
	return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
	BitCast2, {0, 1, 6, 7}));
	return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
	BitCast2, {0, 5, 6, 7}));
	}
	}

	// v4i32 Non Uniform Shifts.
	// If the shift amount is constant we can shift each lane using the SSE2
	// immediate shifts, else we need to zero-extend each lane to the lower i64
	// and shift using the SSE2 variable shifts.
	// The separate results can then be blended together.
	if (VT == MVT::v4i32) {
	unsigned Opc = Op.getOpcode();
	SDValue Amt0, Amt1, Amt2, Amt3;
	if (ConstantAmt) {
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
	} else {
	// ISD::SHL is handled above but we include it here for completeness.
	switch (Opc) {
	default:
	llvm_unreachable("Unknown target vector shift node");
	case ISD::SHL:
	Opc = X86ISD::VSHL;
	break;
	case ISD::SRL:
	Opc = X86ISD::VSRL;
	break;
	case ISD::SRA:
	Opc = X86ISD::VSRA;
	break;
	}
	// The SSE2 shifts use the lower i64 as the same shift amount for
	// all lanes and the upper i64 is ignored. These shuffle masks
	// optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
	SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
	}

	SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
	SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
	SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
	SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
	SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
	return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
	}

	// It's worth extending once and using the vXi16/vXi32 shifts for smaller
	// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
	// make the existing SSE solution better.
	if ((Subtarget.hasInt256() && VT == MVT::v8i16) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v16i16) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v16i8) \|\|
	(Subtarget.hasBWI() && VT == MVT::v32i8)) {
	assert((!Subtarget.hasBWI() \|\| VT == MVT::v32i8 \|\| VT == MVT::v16i8) &&
	"Unexpected vector type");
	MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
	MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
	unsigned ExtOpc =
	Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	R = DAG.getNode(ExtOpc, dl, ExtVT, R);
	Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
	}

	if (VT == MVT::v16i8 \|\|
	(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) \|\|
	(VT == MVT::v64i8 && Subtarget.hasBWI())) {
	MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
	unsigned ShiftOpcode = Op->getOpcode();

	auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
	if (VT.is512BitVector()) {
	// On AVX512BW targets we make use of the fact that VSELECT lowers
	// to a masked blend which selects bytes based just on the sign bit
	// extracted to a mask.
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	} else if (Subtarget.hasSSE41()) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we test for the sign bit by comparing to
	// zero - a negative value will set all bits of the lanes to true
	// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
	SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
	return DAG.getSelect(dl, SelVT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
	// We can safely do this using i16 shifts as we're only interested in
	// the 3 lower bits of each byte.
	Amt = DAG.getBitcast(ExtVT, Amt);
	Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
	Amt = DAG.getBitcast(VT, Amt);

	if (Op->getOpcode() == ISD::SHL \|\| Op->getOpcode() == ISD::SRL) {
	// r = VSELECT(r, shift(r, 4), a);
	SDValue M =
	DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);
	return R;
	}

	if (Op->getOpcode() == ISD::SRA) {
	// For SRA we need to unpack each byte to the higher byte of a i16 vector
	// so we can correctly sign extend. We don't care what happens to the
	// lower byte.
	SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
	SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);

	// r = VSELECT(r, shift(r, 4), a);
	SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(4, dl, ExtVT));
	SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(4, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 2), a);
	MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(2, dl, ExtVT));
	MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(2, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 1), a);
	MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(1, dl, ExtVT));
	MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(1, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// Logical shift the result back to the lower byte, leaving a zero upper
	// byte
	// meaning that we can safely pack with PACKUSWB.
	RLo =
	DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
	RHi =
	DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}
	}

	if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
	MVT ExtVT = MVT::v8i32;
	SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
	SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
	SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
	SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
	SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);
	SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
	SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
	Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
	Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	}

	if (VT == MVT::v8i16) {
	unsigned ShiftOpcode = Op->getOpcode();

	// If we have a constant shift amount, the non-SSE41 path is best as
	// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
	bool UseSSE41 = Subtarget.hasSSE41() &&
	!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	if (UseSSE41) {
	MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
	V0 = DAG.getBitcast(ExtVT, V0);
	V1 = DAG.getBitcast(ExtVT, V1);
	Sel = DAG.getBitcast(ExtVT, Sel);
	return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we splat the sign bit - a negative value will
	// set all bits of the lanes to true and VSELECT uses that in
	// its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue C =
	DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
	return DAG.getSelect(dl, VT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
	if (UseSSE41) {
	// On SSE41 targets we need to replicate the shift mask in both
	// bytes for PBLENDVB.
	Amt = DAG.getNode(
	ISD::OR, dl, VT,
	DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
	DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
	} else {
	Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
	}

	// r = VSELECT(r, shift(r, 8), a);
	SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 4), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(Amt, M, R);
	return R;
	}

	// Decompose 256-bit shifts into smaller 128-bit shifts.
	if (VT.is256BitVector())
	return Lower256IntArith(Op, DAG);

	return SDValue();
	}

	/// Custom lowering for vector rotates (ISD::ROTL / ISD::ROTR).
	///
	/// On AVX512 targets a rotate whose amount is a fully-defined constant that is
	/// identical in every element becomes an immediate rotate (X86ISD::VROTLI or
	/// X86ISD::VROTRI); any other rotate is returned unchanged. On the remaining
	/// path (asserted to be XOP, vector, ROTL only) 256-bit vectors are handed to
	/// Lower256IntArith, constant-splat amounts become X86ISD::VROTLI, and
	/// everything else is returned unchanged.
	///
	/// \param Op        The rotate node: operand 0 is the value, operand 1 the
	///                  rotate amount.
	/// \param Subtarget Queried for AVX512 / XOP availability.
	/// \param DAG       DAG in which the replacement nodes are created.
	/// \returns The lowered node, or \p Op itself when no rewrite is performed.
	static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc DL(Op);
	  SDValue R = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);
	  unsigned Opcode = Op.getOpcode();
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();

	  if (Subtarget.hasAVX512()) {
	    // Attempt to rotate by immediate.
	    APInt UndefElts;
	    SmallVector<APInt, 16> EltBits;
	    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
	      // Capture the vector by reference: a by-value capture would copy every
	      // APInt in it just to compare against the first element.
	      if (!UndefElts && llvm::all_of(EltBits, [&EltBits](const APInt &V) {
	            return EltBits[0] == V;
	          })) {
	        // Named RotOpc (not Op) so the SDValue parameter is not shadowed.
	        unsigned RotOpc =
	            (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
	        // Reduce the amount modulo the element width before encoding it as
	        // the i8 immediate.
	        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
	        return DAG.getNode(RotOpc, DL, VT, R,
	                           DAG.getConstant(RotateAmt, DL, MVT::i8));
	      }
	    }

	    // Else, fall-back on VPROLV/VPRORV.
	    return Op;
	  }

	  assert(VT.isVector() && "Custom lowering only for vector rotates!");
	  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
	  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

	  // XOP has 128-bit vector variable + immediate rotates.
	  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.

	  // Split 256-bit integers.
	  if (VT.is256BitVector())
	    return Lower256IntArith(Op, DAG);

	  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

	  // Attempt to rotate by immediate.
	  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
	    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
	      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
	      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
	      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
	                         DAG.getConstant(RotateAmt, DL, MVT::i8));
	    }
	  }

	  // Use general rotate by variable (per-element).
	  return Op;
	}

	/// Lower an "arithmetic with overflow" node (SADDO, UADDO, SSUBO, USUBO,
	/// SMULO, UMULO) into the matching X86 flag-producing arithmetic node plus a
	/// setcc that reads the relevant condition from the flags result. The
	/// "brcond" lowering looks for this combination and may drop the setcc when
	/// it has a single use.
	///
	/// \returns A MERGE_VALUES node carrying the arithmetic result and the
	///          overflow bit.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  SDNode *N = Op.getNode();
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDLoc DL(Op);

	  // Shared tail: read condition CC from result #FlagResNo of Arith, narrow it
	  // to i1 when the overflow result has that type, and merge both results.
	  auto MergeWithOverflow = [&](SDValue Arith, unsigned FlagResNo,
	                               X86::CondCode CC) -> SDValue {
	    SDValue Overflow =
	        getSETCC(CC, SDValue(Arith.getNode(), FlagResNo), DL, DAG);
	    if (N->getValueType(1) == MVT::i1)
	      Overflow = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Overflow);
	    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, Overflow);
	  };

	  unsigned BaseOp = 0;
	  X86::CondCode Cond;
	  switch (Op.getOpcode()) {
	  default:
	    llvm_unreachable("Unknown ovf instruction!");
	  case ISD::SADDO:
	    // An add of one is selected as INC. INC does not set CF, which is why
	    // the same trick is not applied to UADDO.
	    BaseOp = isOneConstant(RHS) ? X86ISD::INC : X86ISD::ADD;
	    Cond = X86::COND_O;
	    break;
	  case ISD::UADDO:
	    BaseOp = X86ISD::ADD;
	    Cond = X86::COND_B;
	    break;
	  case ISD::SSUBO:
	    // A subtract of one is selected as DEC. DEC does not set CF, which is
	    // why the same trick is not applied to USUBO.
	    BaseOp = isOneConstant(RHS) ? X86ISD::DEC : X86ISD::SUB;
	    Cond = X86::COND_O;
	    break;
	  case ISD::USUBO:
	    BaseOp = X86ISD::SUB;
	    Cond = X86::COND_B;
	    break;
	  case ISD::SMULO:
	    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
	    Cond = X86::COND_O;
	    break;
	  case ISD::UMULO: {
	    if (N->getValueType(0) == MVT::i8) {
	      BaseOp = X86ISD::UMUL8;
	      Cond = X86::COND_O;
	      break;
	    }
	    // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
	    // The wide UMUL has two value results, so the flags are result #2.
	    SDVTList WideVTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
	                                     MVT::i32);
	    SDValue Product = DAG.getNode(X86ISD::UMUL, DL, WideVTs, LHS, RHS);
	    return MergeWithOverflow(Product, 2, X86::COND_O);
	  }
	  }

	  // Also sets EFLAGS (result #1).
	  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
	  SDValue Arith = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
	  return MergeWithOverflow(Arith, 1, Cond);
	}

	/// Decide whether an atomic access of type \p MemType has to go through
	/// cmpxchg8b / cmpxchg16b.
	///
	/// True for a 64-bit type on a non-64-bit subtarget and for a 128-bit type
	/// when the subtarget reports cmpxchg16b; false for every other width (those
	/// are left alone and may become __sync_fetch_and_... calls).
	bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
	  switch (MemType->getPrimitiveSizeInBits()) {
	  case 64:
	    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
	  case 128:
	    return Subtarget.hasCmpxchg16b();
	  default:
	    return false;
	  }
	}

	/// An atomic store is expanded in IR exactly when the stored value's type
	/// needs cmpxchg8b/16b (see needsCmpXchgNb).
	bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();
	  return needsCmpXchgNb(StoredTy);
	}

	// Loads whose pointee type needs cmpxchg8b/16b are expanded to a cmpxchg;
	// all other atomic loads are left untouched.
	// Note: this turns large loads into lock cmpxchg8b/16b.
	// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  auto *PtrTy = cast<PointerType>(LI->getPointerOperandType());
	  if (needsCmpXchgNb(PtrTy->getElementType()))
	    return AtomicExpansionKind::CmpXChg;
	  return AtomicExpansionKind::None;
	}

	/// Choose how an atomicrmw instruction is expanded before instruction
	/// selection: either left alone (None) or rewritten as a cmpxchg loop.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  const unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();

	  // Wider than a native register: use cmpxchg8b/16b when available,
	  // otherwise leave the instruction alone so it defaults to a library call.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
	    if (needsCmpXchgNb(MemType))
	      return AtomicExpansionKind::CmpXChg;
	    return AtomicExpansionKind::None;
	  }

	  switch (AI->getOperation()) {
	  case AtomicRMWInst::Xchg:
	  case AtomicRMWInst::Add:
	  case AtomicRMWInst::Sub:
	    // It's better to use xadd, xsub or xchg for these in all cases.
	    return AtomicExpansionKind::None;

	  case AtomicRMWInst::Or:
	  case AtomicRMWInst::And:
	  case AtomicRMWInst::Xor:
	    // With no user of the result, a "lock"-prefixed normal instruction is
	    // enough; otherwise a cmpxchg loop is needed to produce the old value.
	    if (AI->use_empty())
	      return AtomicExpansionKind::None;
	    return AtomicExpansionKind::CmpXChg;

	  case AtomicRMWInst::Nand:
	  case AtomicRMWInst::Max:
	  case AtomicRMWInst::Min:
	  case AtomicRMWInst::UMax:
	  case AtomicRMWInst::UMin:
	    // These always require a non-trivial set of data operations on x86. We
	    // must use a cmpxchg loop.
	    return AtomicExpansionKind::CmpXChg;

	  default:
	    llvm_unreachable("Unknown atomic operation");
	  }
	}

	/// Replace an idempotent atomicrmw with an mfence followed by an atomic load.
	///
	/// The rewrite is skipped (nullptr is returned and \p AI is left untouched)
	/// when the access is wider than the native width, when the sync scope is
	/// single-thread, or when the subtarget has no mfence.
	///
	/// \param AI The atomicrmw to replace. On success all its uses are redirected
	///           to the new load and it is erased from its parent.
	/// \returns The newly created atomic load, or nullptr when no rewrite is done.
	LoadInst *
	X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
	  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();
	  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
	  // there is no benefit in turning such RMWs into loads, and it is actually
	  // harmful as it introduces a mfence.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
	    return nullptr;

	  auto Builder = IRBuilder<>(AI);
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  auto SSID = AI->getSyncScopeID();
	  // We must restrict the ordering to avoid generating loads with Release or
	  // AcquireRelease orderings.
	  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
	  auto Ptr = AI->getPointerOperand();

	  // Before the load we need a fence. Here is an example lifted from
	  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
	  // is required:
	  // Thread 0:
	  //   x.store(1, relaxed);
	  //   r1 = y.fetch_add(0, release);
	  // Thread 1:
	  //   y.fetch_add(42, acquire);
	  //   r2 = x.load(relaxed);
	  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
	  // lowered to just a load without a fence. A mfence flushes the store buffer,
	  // making the optimization clearly correct.
	  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
	  // otherwise, we might be able to be more aggressive on relaxed idempotent
	  // rmw. In practice, they do not look useful, so we don't try to be
	  // especially clever.
	  if (SSID == SyncScope::SingleThread)
	    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
	    // the IR level, so we must wrap it in an intrinsic.
	    return nullptr;

	  if (!Subtarget.hasMFence())
	    // FIXME: it might make sense to use a locked operation here but on a
	    // different cache-line to prevent cache-line bouncing. In practice it
	    // is probably a small win, and x86 processors without mfence are rare
	    // enough that we do not bother.
	    return nullptr;

	  Function *MFence =
	      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
	  Builder.CreateCall(MFence, {});

	  // Finally we can emit the atomic load. CreateAlignedLoad takes the
	  // alignment in bytes, so convert the bit width; passing the bit width
	  // directly would claim an alignment eight times larger than the access.
	  // NOTE(review): this assumes the atomicrmw operand is naturally aligned
	  // (alignment == size in bytes) - confirm against the IR contract.
	  unsigned AlignInBytes = MemType->getPrimitiveSizeInBits() / 8;
	  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AlignInBytes);
	  Loaded->setAtomic(Order, SSID);
	  AI->replaceAllUsesWith(Loaded);
	  AI->eraseFromParent();
	  return Loaded;
	}

	/// Lower an ATOMIC_FENCE node.
	///
	/// Only a sequentially-consistent, system-scope fence produces a real
	/// instruction: MFENCE when the subtarget has it, otherwise a locked OR of
	/// zero into the memory addressed by ESP. Every other fence becomes an
	/// X86ISD::MEMBARRIER node.
	static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  SDValue Chain = Op.getOperand(0);
	  auto Ordering = static_cast<AtomicOrdering>(
	      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
	  auto Scope = static_cast<SyncScope::ID>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // The only fence that needs an instruction is a sequentially-consistent
	  // cross-thread fence.
	  const bool NeedsInstruction =
	      Ordering == AtomicOrdering::SequentiallyConsistent &&
	      Scope == SyncScope::System;

	  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
	  if (!NeedsInstruction)
	    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Chain);

	  if (Subtarget.hasMFence())
	    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Chain);

	  // No MFENCE available: emit a locked OR with the memory operand
	  // [ESP + 0], source zero, chained after the incoming chain.
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	  SDValue Ops[] = {
	      DAG.getRegister(X86::ESP, MVT::i32),    // Base
	      DAG.getTargetConstant(1, dl, MVT::i8),  // Scale
	      DAG.getRegister(0, MVT::i32),           // Index
	      DAG.getTargetConstant(0, dl, MVT::i32), // Disp
	      DAG.getRegister(0, MVT::i32),           // Segment.
	      Zero,
	      Chain};
	  SDNode *LockedOr =
	      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
	  return SDValue(LockedOr, 0);
	}

	/// Lower a compare-and-swap node to X86ISD::LCMPXCHG_DAG.
	///
	/// Operand 2 is copied into the accumulator register matching the value
	/// type, the locked cmpxchg node is emitted, and the three results of \p Op
	/// are rewired: the value read back from the accumulator, a success flag
	/// (COND_E on EFLAGS) and the outgoing chain. All uses are replaced in place,
	/// so an empty SDValue is returned.
	static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  MVT T = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  // Accumulator sub-register and operand size (in bytes) for this type.
	  unsigned AccReg = 0;
	  unsigned SizeInBytes = 0;
	  switch (T.SimpleTy) {
	  case MVT::i8:
	    AccReg = X86::AL;
	    SizeInBytes = 1;
	    break;
	  case MVT::i16:
	    AccReg = X86::AX;
	    SizeInBytes = 2;
	    break;
	  case MVT::i32:
	    AccReg = X86::EAX;
	    SizeInBytes = 4;
	    break;
	  case MVT::i64:
	    assert(Subtarget.is64Bit() && "Node not type legal!");
	    AccReg = X86::RAX;
	    SizeInBytes = 8;
	    break;
	  default:
	    llvm_unreachable("Invalid value type!");
	  }

	  // Glue the accumulator copy to the cmpxchg so nothing is scheduled between.
	  SDValue CopyIn = DAG.getCopyToReg(Op.getOperand(0), DL, AccReg,
	                                    Op.getOperand(2), SDValue());
	  SDValue CmpXchgOps[] = {CopyIn.getValue(0), Op.getOperand(1),
	                          Op.getOperand(3),
	                          DAG.getTargetConstant(SizeInBytes, DL, MVT::i8),
	                          CopyIn.getValue(1)};
	  SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
	  SDValue CmpXchg = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL,
	                                            ResultTys, CmpXchgOps, T, MMO);

	  // Read back the accumulator, then EFLAGS, chained and glued in sequence.
	  SDValue CopyOut = DAG.getCopyFromReg(CmpXchg.getValue(0), DL, AccReg, T,
	                                       CmpXchg.getValue(1));
	  SDValue Flags = DAG.getCopyFromReg(CopyOut.getValue(1), DL, X86::EFLAGS,
	                                     MVT::i32, CopyOut.getValue(2));
	  SDValue Succeeded = getSETCC(X86::COND_E, Flags, DL, DAG);

	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), CopyOut);
	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Succeeded);
	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), Flags.getValue(1));
	  return SDValue();
	}

	/// Custom lowering for ISD::BITCAST.
	///
	/// For a v2i32 / v4i16 / v8i8 / i64 source converted to f64, the source
	/// elements are placed in the low half of a vector with twice as many
	/// elements (upper half undef), that vector is bitcast to v2f64 and element 0
	/// is extracted. Any other destination for those sources is left to be
	/// expanded. The remaining path handles the 64-bit MMX-without-SSE2 case,
	/// where i64 <=> vector and vector <=> vector conversions are legal as-is.
	///
	/// \returns The lowered value, \p Op itself when the node is already legal,
	///          or an empty SDValue when the conversion has to be expanded.
	static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
	  MVT DstVT = Op.getSimpleValueType();

	  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
	      SrcVT == MVT::i64) {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	    if (DstVT != MVT::f64)
	      // This conversion needs to be expanded.
	      return SDValue();

	    SDValue Op0 = Op->getOperand(0);
	    SmallVector<SDValue, 16> Elts;
	    SDLoc dl(Op);
	    unsigned NumElts;
	    MVT SVT;
	    if (SrcVT.isVector()) {
	      NumElts = SrcVT.getVectorNumElements();
	      SVT = SrcVT.getVectorElementType();

	      // Widen the vector in input in the case of MVT::v2i32.
	      // Example: from MVT::v2i32 to MVT::v4i32.
	      for (unsigned i = 0, e = NumElts; i != e; ++i)
	        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
	                                   DAG.getIntPtrConstant(i, dl)));
	    } else {
	      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
	             "Unexpected source type in LowerBITCAST");
	      // Split the i64 into its two i32 halves.
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
	                                 DAG.getIntPtrConstant(0, dl)));
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
	                                 DAG.getIntPtrConstant(1, dl)));
	      NumElts = 2;
	      SVT = MVT::i32;
	    }
	    // Explicitly mark the extra elements as Undef.
	    Elts.append(NumElts, DAG.getUNDEF(SVT));

	    // Elts now holds 2 * NumElts values, so the build vector type must have
	    // twice the source element count.
	    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
	    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
	    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
	         Subtarget.hasMMX() && "Unexpected custom BITCAST");
	  assert((DstVT == MVT::i64 ||
	          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
	         "Unexpected custom BITCAST");
	  // i64 <=> MMX conversions are Legal.
	  if (SrcVT==MVT::i64 && DstVT.isVector())
	    return Op;
	  if (DstVT==MVT::i64 && SrcVT.isVector())
	    return Op;
	  // MMX <=> MMX conversions are Legal.
	  if (SrcVT.isVector() && DstVT.isVector())
	    return Op;
	  // All other conversions need to be expanded.
	  return SDValue();
	}

	/// Compute the horizontal sum of bytes in V for the elements of VT.
	///
	/// Requires V to be a byte vector and VT to be an integer vector type with
	/// wider elements than V's type. The width of the elements of VT determines
	/// how many bytes of V are summed horizontally to produce each element of the
	/// result.
	///
	/// \param V         Byte vector to sum; asserted to have i8 elements and the
	///                  same total bit width as \p VT.
	/// \param VT        Result vector type; its element type must be i64, i32 or
	///                  i16 (i8 is rejected by assertion).
	/// \param Subtarget Forwarded to getZeroVector.
	/// \param DAG       DAG in which the nodes are created.
	/// \returns A value of type \p VT holding the per-element byte sums.
	static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDLoc DL(V);
	MVT ByteVecVT = V.getSimpleValueType();
	MVT EltVT = VT.getVectorElementType();
	assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
	"Expected value to have byte element type.");
	assert(EltVT != MVT::i8 &&
	"Horizontal byte sum only makes sense for wider elements!");
	unsigned VecSize = VT.getSizeInBits();
	assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

	// PSADBW instruction horizontally add all bytes and leave the result in i64
	// chunks, thus directly computes the pop count for v2i64 and v4i64.
	if (EltVT == MVT::i64) {
	// A sum of absolute differences against zero is just the sum of the bytes.
	SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
	MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
	return DAG.getBitcast(VT, V);
	}

	if (EltVT == MVT::i32) {
	// We unpack the low half and high half into i32s interleaved with zeros so
	// that we can use PSADBW to horizontally sum them. The most useful part of
	// this is that it lines up the results of two PSADBW instructions to be
	// two v2i64 vectors which concatenated are the 4 population counts. We can
	// then use PACKUSWB to shrink and concatenate them into a v4i32 again.
	SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
	SDValue V32 = DAG.getBitcast(VT, V);
	SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
	SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

	// Do the horizontal sums into two v2i64s.
	// Zeros is rebuilt with the byte vector type because the PSADBW operands
	// below are bitcast to ByteVecVT.
	Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
	MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
	DAG.getBitcast(ByteVecVT, Low), Zeros);
	High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
	DAG.getBitcast(ByteVecVT, High), Zeros);

	// Merge them together.
	// PACKUS consumes the two sums viewed as i16 vectors and produces a byte
	// vector, which is then reinterpreted as VT.
	MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
	V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
	DAG.getBitcast(ShortVecVT, Low),
	DAG.getBitcast(ShortVecVT, High));

	return DAG.getBitcast(VT, V);
	}

	// The only element type left is i16.
	assert(EltVT == MVT::i16 && "Unknown how to handle type");

	// To obtain pop count for each i16 element starting from the pop count for
	// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
	// right by 8. It is important to shift as i16s as i8 vector shift isn't
	// directly supported.
	SDValue ShifterV = DAG.getConstant(8, DL, VT);
	SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
	// After this byte-wise add, the high byte of each i16 holds low + high.
	V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
	DAG.getBitcast(ByteVecVT, V));
	return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
	}

	/// Lower a vector population count with an in-register lookup table indexed
	/// through PSHUFB.
	///
	/// \param Op        The vector whose bits are counted (the CTPOP operand, not
	///                  the CTPOP node itself).
	/// \param DL        Debug location for the created nodes.
	/// \param Subtarget Forwarded to LowerHorizontalByteSum.
	/// \param DAG       DAG in which the nodes are created.
	/// \returns The per-element population count, with the same type as \p Op.
	static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned VecSize = VT.getSizeInBits();

	  // Implement a lookup table in register by using an algorithm based on:
	  // http://wm.ite.pl/articles/sse-popcount.html
	  //
	  // The general idea is that every lower byte nibble in the input vector is an
	  // index into a in-register pre-computed pop count table. We then split up the
	  // input vector in two new ones: (1) a vector with only the shifted-right
	  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
	  // masked out higher ones) for each byte. PSHUFB is used separately with both
	  // to index the in-register table. Next, both are added and the result is a
	  // i8 vector where each element contains the pop count for input byte.
	  //
	  // To obtain the pop count for elements != i8, we follow up with the same
	  // approach and use additional tricks as described below.
	  //
	  // LUT[i] is the number of set bits in the 4-bit value i.
	  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
	                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
	                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

	  int NumByteElts = VecSize / 8;
	  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
	  SDValue In = DAG.getBitcast(ByteVecVT, Op);
	  // Repeat the 16-entry table across the whole byte vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumByteElts; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
	  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

	  // High nibbles
	  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
	  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

	  // Low nibbles
	  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

	  // The input vector is used as the shuffle mask that index elements into the
	  // LUT. After counting low and high nibbles, add the vector to obtain the
	  // final pop count per i8 element.
	  SDValue HighPopCnt =
	      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
	  SDValue LowPopCnt =
	      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
	  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

	  if (EltVT == MVT::i8)
	    return PopCnt;

	  // Wider elements: fold the per-byte counts into per-element counts.
	  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
	}

	/// Lower a 128-bit vector population count using only shifts, masks, adds and
	/// subtracts (no lookup table).
	///
	/// \param Op        The vector whose bits are counted; asserted to be 128 bits
	///                  wide.
	/// \param DL        Debug location for the created nodes.
	/// \param Subtarget Forwarded to LowerHorizontalByteSum.
	/// \param DAG       DAG in which the nodes are created.
	/// \returns The per-element population count, with the same type as \p Op.
	static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	assert(VT.is128BitVector() &&
	"Only 128-bit vector bitmath lowering supported.");

	int VecSize = VT.getSizeInBits();
	MVT EltVT = VT.getVectorElementType();
	int Len = EltVT.getSizeInBits();

	// This is the vectorized version of the "best" algorithm from
	// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
	// with a minor tweak to use a series of adds + shifts instead of vector
	// multiplications. Implemented for all integer vector types. We only use
	// this when we don't have SSSE3 which allows a LUT-based lowering that is
	// much faster, even faster than using native popcnt instructions.

	// Apply shift opcode OpCode to V by the splat constant Shifter, in V's own
	// type.
	auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
	MVT VT = V.getSimpleValueType();
	SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
	return DAG.getNode(OpCode, DL, VT, V, ShifterV);
	};
	// AND V with the splat constant Mask, in V's own type.
	auto GetMask = [&](SDValue V, APInt Mask) {
	MVT VT = V.getSimpleValueType();
	SDValue MaskV = DAG.getConstant(Mask, DL, VT);
	return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
	};

	// We don't want to incur the implicit masks required to SRL vNi8 vectors on
	// x86, so set the SRL type to have elements at least i16 wide. This is
	// correct because all of our SRLs are followed immediately by a mask anyways
	// that handles any bits that sneak into the high bits of the byte elements.
	MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

	SDValue V = Op;

	// v = v - ((v >> 1) & 0x55555555...)
	// Each mask below is the given byte pattern splatted to the element width.
	SDValue Srl =
	DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
	SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
	V = DAG.getNode(ISD::SUB, DL, VT, V, And);

	// v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
	SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
	Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
	SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
	V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);

	// v = (v + (v >> 4)) & 0x0F0F0F0F...
	Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
	SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
	V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));

	// At this point, V contains the byte-wise population count, and we are
	// merely doing a horizontal sum if necessary to get the wider element
	// counts.
	if (EltVT == MVT::i8)
	return V;

	// LowerHorizontalByteSum expects a byte vector, so view V as vNi8 first.
	return LowerHorizontalByteSum(
	DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
	DAG);
	}

	// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
	// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
	//
	// Lower a 128/256/512-bit vector CTPOP. In order of preference:
	//  - with VPOPCNTDQ and at most 16 elements: zero-extend to i32 elements,
	//    count there, and truncate back;
	//  - without SSSE3: the shift/mask bitmath lowering (128-bit only);
	//  - split 256-bit (no AVX2) or 512-bit (no BWI) vectors into halves;
	//  - otherwise the PSHUFB in-register LUT lowering.
	static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
	         "Unknown CTPOP type to handle");
	  SDLoc DL(Op.getNode());
	  SDValue Op0 = Op.getOperand(0);

	  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
	  if (Subtarget.hasVPOPCNTDQ()) {
	    unsigned NumElems = VT.getVectorNumElements();
	    assert((VT.getVectorElementType() == MVT::i8 ||
	            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
	    if (NumElems <= 16) {
	      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
	      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
	    }
	  }

	  if (!Subtarget.hasSSSE3()) {
	    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
	    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
	    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
	  }

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Decompose 512-bit ops into smaller 256-bit ops.
	  if (VT.is512BitVector() && !Subtarget.hasBWI())
	    return Lower512IntUnary(Op, DAG);

	  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
	}

	/// Entry point for custom CTPOP lowering. Only vector population counts are
	/// custom-lowered (asserted); the work is delegated to LowerVectorCTPOP.
	static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().isVector() &&
	         "We only do custom lowering for vector population count.");
	  SDValue Lowered = LowerVectorCTPOP(Op, Subtarget, DAG);
	  return Lowered;
	}

	/// Lower BITREVERSE using the XOP VPPERM instruction.
	///
	/// Scalars are moved into a 128-bit vector, reversed there and extracted
	/// again. 256-bit vectors are handed to Lower256IntUnary. For 128-bit vectors
	/// a single VPPERM both reverses the bits of every byte and reverses the byte
	/// order within each element.
	static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  // For scalars, its still beneficial to transfer to/from the SIMD unit to
	  // perform the BITREVERSE.
	  if (!VT.isVector()) {
	    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
	    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
	    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  int NumElts = VT.getVectorNumElements();
	  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector())
	    return Lower256IntUnary(Op, DAG);

	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitreverse lowering supported.");

	  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
	  // perform the BSWAP in the shuffle.
	  // Its best to shuffle using the second operand as this will implicitly allow
	  // memory folding for multiple vectors.
	  SmallVector<SDValue, 16> MaskElts;
	  for (int i = 0; i != NumElts; ++i) {
	    // Walk the bytes of each element from last to first (the BSWAP part).
	    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
	      // Byte indices 16..31 select from the second source operand.
	      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
	      int PermuteByte = SourceByte | (2 << 5);
	      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
	    }
	  }

	  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
	  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
	  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
	                    Res, Mask);
	  return DAG.getBitcast(VT, Res);
	}

	/// Custom lowering for BITREVERSE.
	///
	/// XOP targets (for anything but 512-bit vectors) use LowerBITREVERSE_XOP.
	/// Otherwise SSSE3 is required and only byte vectors are handled: each byte
	/// is split into nibbles, each nibble is bit-reversed through a PSHUFB table
	/// lookup that also moves it to the opposite nibble, and the halves are ORed.
	static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (Subtarget.hasXOP() && !VT.is512BitVector())
	    return LowerBITREVERSE_XOP(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  unsigned NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarType() == MVT::i8 &&
	         "Only byte vector BITREVERSE supported");

	  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
	  // two nibbles and a PSHUFB lookup to find the bitreverse of each
	  // 0-15 value (moved to the other nibble).
	  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

	  // LoLUT[i]: the 4-bit reversal of i, placed in the high nibble.
	  const int LoLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
	      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
	      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
	      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
	  // HiLUT[i]: the 4-bit reversal of i, placed in the low nibble.
	  const int HiLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
	      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
	      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
	      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

	  // Repeat both 16-entry tables across the full vector width.
	  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
	    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
	  }

	  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
	  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
	  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
	}

	/// Turn an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node into the corresponding
	/// X86ISD LOCK-prefixed memory node (LADD, LSUB, LOR, LXOR, LAND).
	///
	/// When \p AllowIncDec is set and inc/dec are not slow on the subtarget (or
	/// the function is optimized for size), an add/sub of constant +1 or -1 is
	/// emitted as LINC or LDEC instead.
	///
	/// \param N           The atomic RMW node (chain, pointer, value operands).
	/// \param DAG         DAG in which the new node is created.
	/// \param Subtarget   Queried for slowIncDec().
	/// \param AllowIncDec Whether the LINC/LDEC forms may be used.
	/// \returns The new memory-intrinsic node, with results (i32, chain).
	static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget,
	                                        bool AllowIncDec = true) {
	  unsigned NewOpc = 0;
	  switch (N->getOpcode()) {
	  case ISD::ATOMIC_LOAD_ADD:
	    NewOpc = X86ISD::LADD;
	    break;
	  case ISD::ATOMIC_LOAD_SUB:
	    NewOpc = X86ISD::LSUB;
	    break;
	  case ISD::ATOMIC_LOAD_OR:
	    NewOpc = X86ISD::LOR;
	    break;
	  case ISD::ATOMIC_LOAD_XOR:
	    NewOpc = X86ISD::LXOR;
	    break;
	  case ISD::ATOMIC_LOAD_AND:
	    NewOpc = X86ISD::LAND;
	    break;
	  default:
	    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
	  }

	  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

	  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
	    // Convert to inc/dec if they aren't slow or we are optimizing for size.
	    if (AllowIncDec && (!Subtarget.slowIncDec() ||
	                        DAG.getMachineFunction().getFunction().optForSize())) {
	      // add 1 and sub -1 are both an increment.
	      if ((NewOpc == X86ISD::LADD && C->isOne()) ||
	          (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
	        return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
	                                       DAG.getVTList(MVT::i32, MVT::Other),
	                                       {N->getOperand(0), N->getOperand(1)},
	                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
	      // sub 1 and add -1 are both a decrement.
	      if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
	          (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
	        return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
	                                       DAG.getVTList(MVT::i32, MVT::Other),
	                                       {N->getOperand(0), N->getOperand(1)},
	                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
	    }
	  }

	  return DAG.getMemIntrinsicNode(
	      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
	      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
	      /*MemVT=*/N->getSimpleValueType(0), MMO);
	}

	/// Custom-lower an atomic read-modify-write arithmetic node.
	///
	/// When the loaded value is unused, a LOCK-prefixed operation is created, the
	/// chain users of \p N are redirected to it, and an empty SDValue is returned.
	/// When the value is used, ATOMIC_LOAD_SUB is rewritten as ATOMIC_LOAD_ADD of
	/// the negated operand, and ATOMIC_LOAD_ADD is returned unchanged.
	static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {
	  SDValue InChain = N->getOperand(0);
	  SDValue Ptr = N->getOperand(1);
	  SDValue Val = N->getOperand(2);
	  const unsigned Opcode = N->getOpcode();
	  const MVT ResultVT = N->getSimpleValueType(0);
	  SDLoc Loc(N);

	  if (!N->hasAnyUseOfValue(0)) {
	    // Result is dead: any of the supported ops can become a LOCK-prefixed
	    // instruction.
	    SDValue Locked = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
	    // Only the chain needs to be rewired, since the value has no users.
	    assert(!N->hasAnyUseOfValue(0));
	    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), Locked.getValue(1));
	    return SDValue();
	  }

	  // The result is used. Only atomic_load_add can be lowered (to LXADD) in
	  // that case; every other atomicrmw op should already have been turned into
	  // a cmpxchg loop in AtomicExpand.
	  if (Opcode != ISD::ATOMIC_LOAD_SUB) {
	    assert(Opcode == ISD::ATOMIC_LOAD_ADD &&
	           "Used AtomicRMW ops other than Add should have been expanded!");
	    return N;
	  }

	  // Treat (atomic_load_sub p, v) as (atomic_load_add p, -v) so that LXADD can
	  // be selected when LOCK_SUB cannot.
	  auto *AtomicN = cast<AtomicSDNode>(N.getNode());
	  SDValue Zero = DAG.getConstant(0, Loc, ResultVT);
	  SDValue Negated = DAG.getNode(ISD::SUB, Loc, ResultVT, Zero, Val);
	  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, Loc, ResultVT, InChain, Ptr,
	                       Negated, AtomicN->getMemOperand());
	}

	/// Custom-lower ISD::ATOMIC_STORE.
	///
	/// A sequentially consistent store, or a store whose memory type is not
	/// legal, is replaced by an ISD::ATOMIC_SWAP on the same operands and memory
	/// operand; the swap's chain result (value 1) is returned. Any other atomic
	/// store is returned unchanged.
	static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
	  auto *Store = cast<AtomicSDNode>(Op.getNode());
	  SDLoc dl(Op);
	  EVT VT = Store->getMemoryVT();

	  // Convert seq_cst store -> xchg
	  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
	  // FIXME: On 32-bit, store -> fist or movq would be more efficient
	  //        (The only way to get a 16-byte store is cmpxchg16b)
	  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
	  if (Store->getOrdering() == AtomicOrdering::SequentiallyConsistent ||
	      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
	    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, VT,
	                                 Store->getOperand(0), Store->getOperand(1),
	                                 Store->getOperand(2), Store->getMemOperand());
	    return Swap.getValue(1);
	  }
	  // Other atomic stores have a simple pattern.
	  return Op;
	}

	/// Custom-lower ISD::ADDCARRY / ISD::SUBCARRY to X86ISD::ADC / X86ISD::SBB.
	///
	/// Returns an empty SDValue when the result type is not legal yet. Otherwise
	/// returns a MERGE_VALUES of the arithmetic result and the carry-out, the
	/// latter truncated to i1 when the node's second result type is i1.
	static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
	  SDNode *Node = Op.getNode();
	  const MVT ResultVT = Node->getSimpleValueType(0);

	  // Illegal result types are left for the legalizer to expand.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(ResultVT))
	    return SDValue();

	  SDVTList ResultAndFlags = DAG.getVTList(ResultVT, MVT::i32);
	  SDLoc Loc(Node);

	  // Set the carry flag: add all-ones to the incoming carry operand. The
	  // second result of this ADD is what feeds the ADC/SBB below.
	  SDValue CarryIn = Op.getOperand(2);
	  const EVT CarryInVT = CarryIn.getValueType();
	  const APInt AllOnes =
	      APInt::getAllOnesValue(CarryInVT.getScalarSizeInBits());
	  SDValue AllOnesNode = DAG.getConstant(AllOnes, Loc, CarryInVT);
	  SDValue SetCarry =
	      DAG.getNode(X86ISD::ADD, Loc, DAG.getVTList(CarryInVT, MVT::i32),
	                  CarryIn, AllOnesNode);

	  unsigned ArithOpc = X86ISD::SBB;
	  if (Op.getOpcode() == ISD::ADDCARRY)
	    ArithOpc = X86ISD::ADC;
	  SDValue Arith = DAG.getNode(ArithOpc, Loc, ResultAndFlags, Op.getOperand(0),
	                              Op.getOperand(1), SetCarry.getValue(1));

	  // Carry-out is read back with a COND_B setcc on the second result.
	  SDValue CarryOut = getSETCC(X86::COND_B, Arith.getValue(1), Loc, DAG);
	  if (Node->getValueType(1) == MVT::i1)
	    CarryOut = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i1, CarryOut);

	  return DAG.getNode(ISD::MERGE_VALUES, Loc, Node->getVTList(), Arith,
	                     CarryOut);
	}

	/// Custom-lower ISD::FSINCOS to a call of the SINCOS_STRET libcall.
	///
	/// Only valid for 64-bit Darwin targets (asserted). For an f64 argument the
	/// call result is returned directly; otherwise the result is a 4-element
	/// vector from which elements 0 and 1 are extracted and returned as a
	/// MERGE_VALUES pair (sin, cos).
	static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

	  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values as { float, float } (in XMM0) or
	  // { double, double } (which is returned in XMM0, XMM1).
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  bool isF64 = ArgVT == MVT::f64;
	  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
	  // the small struct {f32, f32} is returned in (eax, edx). For f64,
	  // the results are returned via SRet in memory.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
	  const char *LibcallName = TLI.getLibcallName(LC);
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

	  // f64: a two-element struct; otherwise: a 4-element vector of the
	  // argument type.
	  Type *RetTy = isF64 ? static_cast<Type *>(StructType::get(ArgTy, ArgTy))
	                      : static_cast<Type *>(VectorType::get(ArgTy, 4));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

	  if (isF64)
	    // Returned in xmm0 and xmm1.
	    return CallResult.first;

	  // Returned in bits 0:31 and 32:63 of xmm0.
	  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(0, dl));
	  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(1, dl));
	  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
	}

	/// Widen a vector input to a vector of NVT. The
	/// input vector must have the same element type as NVT.
	///
	/// \param InOp           Vector to widen; returned as-is if it already has
	///                       type \p NVT, and replaced by UNDEF of \p NVT if it is
	///                       undef.
	/// \param NVT            Target type. Its element count must be a larger
	///                       multiple of the input's (asserted).
	/// \param FillWithZeroes If true the added elements are zero, otherwise they
	///                       are undef.
	/// \returns A BUILD_VECTOR when the input is a build vector of constants,
	///          otherwise an INSERT_SUBVECTOR of the input at index 0 into a
	///          zero/undef vector of type \p NVT.
	static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
	                            bool FillWithZeroes = false) {
	  // Check if InOp already has the right width.
	  MVT InVT = InOp.getSimpleValueType();
	  if (InVT == NVT)
	    return InOp;

	  if (InOp.isUndef())
	    return DAG.getUNDEF(NVT);

	  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
	         "input and widen element type must match");

	  unsigned InNumElts = InVT.getVectorNumElements();
	  unsigned WidenNumElts = NVT.getVectorNumElements();
	  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
	         "Unexpected request for vector widening");

	  SDLoc dl(InOp);
	  // A two-operand concat whose upper half is already the fill value (undef,
	  // or zeros when zero-filling) can be replaced by its lower half before
	  // widening.
	  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
	      InOp.getNumOperands() == 2) {
	    SDValue N1 = InOp.getOperand(1);
	    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
	        N1.isUndef()) {
	      InOp = InOp.getOperand(0);
	      InVT = InOp.getSimpleValueType();
	      InNumElts = InVT.getVectorNumElements();
	    }
	  }
	  // Constant build vectors are rebuilt directly at the wider type.
	  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
	      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
	    SmallVector<SDValue, 16> Ops;
	    for (unsigned i = 0; i < InNumElts; ++i)
	      Ops.push_back(InOp.getOperand(i));

	    EVT EltVT = InOp.getOperand(0).getValueType();

	    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
	      DAG.getUNDEF(EltVT);
	    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
	      Ops.push_back(FillVal);
	    return DAG.getBuildVector(NVT, dl, Ops);
	  }
	  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
	    DAG.getUNDEF(NVT);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
	                     InOp, DAG.getIntPtrConstant(0, dl));
	}

	/// Custom-lower a masked scatter into an X86MaskedScatterSDNode.
	///
	/// Requires AVX-512 (asserted). A v2i32 store whose value was promoted to
	/// v2i64 is first re-widened to v4i32. Without VLX, when neither the data nor
	/// the index is 512 bits wide, the index is sign-extended to v8i64 or all
	/// operands are widened to 8 elements. The mask is then truncated to an i1
	/// vector. All uses of \p Op are replaced by the new node's chain (value 1),
	/// which is also returned.
	static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "MGATHER/MSCATTER are supported on AVX-512 arch only");

	  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
	  SDValue Src = N->getValue();
	  MVT VT = Src.getSimpleValueType();
	  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
	  SDLoc dl(Op);

	  SDValue Index = N->getIndex();
	  SDValue Mask = N->getMask();
	  SDValue Chain = N->getChain();
	  SDValue BasePtr = N->getBasePtr();
	  MVT MemVT = N->getMemoryVT().getSimpleVT();
	  MVT IndexVT = Index.getSimpleValueType();
	  MVT MaskVT = Mask.getSimpleValueType();

	  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
	    // The v2i32 value was promoted to v2i64.
	    // Now we "redo" the type legalizer's work and widen the original
	    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
	    // with a shuffle.
	    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
	           "Unexpected memory type");
	    int ShuffleMask[] = {0, 2, -1, -1};
	    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
	                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
	    // Now we have 4 elements instead of 2.
	    // Expand the index.
	    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
	    Index = ExtendToType(Index, NewIndexVT, DAG);

	    // Expand the mask with zeroes
	    // Mask may be <2 x i64> or <2 x i1> at this moment
	    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
	           "Unexpected mask type");
	    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
	    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
	    VT = MVT::v4i32;
	  }

	  unsigned NumElts = VT.getVectorNumElements();
	  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
	      !Index.getSimpleValueType().is512BitVector()) {
	    // AVX512F supports only 512-bit vectors: either the data or the index
	    // must be 512 bits wide. If both index and data are 256-bit but the
	    // vector contains 8 elements, we just sign-extend the index.
	    if (IndexVT == MVT::v8i32) {
	      // Just extend index
	      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
	    } else {
	      // The minimal number of elts in scatter is 8
	      NumElts = 8;
	      // Index
	      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
	      // Use original index here, do not modify the index twice
	      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
	      if (IndexVT.getScalarType() == MVT::i32)
	        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

	      // Mask
	      // At this point we have promoted mask operand
	      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
	      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
	      // Use the original mask here, do not modify the mask twice
	      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

	      // The value that should be stored
	      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
	      Src = ExtendToType(Src, NewVT, DAG);
	    }
	  }
	  // If the mask is "wide" at this point - truncate it to i1 vector
	  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
	  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

	  // The mask is killed by scatter, add it to the values
	  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
	  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
	  SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
	      VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
	  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
	  return SDValue(NewScatter.getNode(), 1);
	}

	/// Custom-lower a masked load.
	///
	/// Non-expanding loads of at most 4 elements are returned unchanged. Every
	/// other load reaching this point must be on AVX-512 without VLX and narrower
	/// than 512 bits (asserted): the pass-through value and the mask are widened
	/// to 512 bits (the mask zero-filled and truncated to i1 elements if needed),
	/// a wide masked load is emitted, and the low subvector of the original type
	/// is extracted. Returns merged (data, chain) values.
	static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {

	  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
	  MVT VT = Op.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
	         "Expanding masked load is supported on AVX-512 target only!");

	  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
	         "Expanding masked load is supported for 32 and 64-bit types only!");

	  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
	  // VLX. These types for exp-loads are handled here.
	  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
	    return Op;

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked load op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked load op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
	  SDValue Src0 = N->getSrc0();
	  Src0 = ExtendToType(Src0, WideDataVT, DAG);

	  // Mask element has to be i1.
	  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
	  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
	         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

	  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

	  // The added mask lanes are zero so the extra elements are never loaded.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  if (MaskEltTy != MVT::i1)
	    Mask = DAG.getNode(ISD::TRUNCATE, dl,
	                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
	  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
	                                      N->getBasePtr(), Mask, Src0,
	                                      N->getMemoryVT(), N->getMemOperand(),
	                                      N->getExtensionType(),
	                                      N->isExpandingLoad());

	  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
	                                NewLoad.getValue(0),
	                                DAG.getIntPtrConstant(0, dl));
	  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Custom-lower a masked store.
	///
	/// Non-compressing stores of at most 4 elements are returned unchanged. Every
	/// other store reaching this point must be on AVX-512 without VLX and
	/// narrower than 512 bits (asserted): the stored value and the mask are
	/// widened to 512 bits (the mask zero-filled and truncated to i1 elements if
	/// needed) and a wide masked store is emitted.
	static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
	  SDValue DataToStore = N->getValue();
	  MVT VT = DataToStore.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
	         "Compressing masked store is supported on AVX-512 target only!");

	  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
	         "Compressing masked store is supported for 32 and 64-bit types only!");

	  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
	  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
	    return Op;

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked store op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	          "Unsupported masked store op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

	  // Mask element has to be i1.
	  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
	  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
	         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

	  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

	  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
	  // The added mask lanes are zero so the extra elements are never stored.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  if (MaskEltTy != MVT::i1)
	    Mask = DAG.getNode(ISD::TRUNCATE, dl,
	                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
	  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
	                            Mask, N->getMemoryVT(), N->getMemOperand(),
	                            N->isTruncatingStore(), N->isCompressingStore());
	}

	/// Custom-lower a masked gather into an X86MaskedGatherSDNode.
	///
	/// Returns an empty SDValue when the index type is v2i32 (the call then comes
	/// from type legalization). Otherwise returns merged (data, chain) values.
	/// On AVX-512 without VLX, when neither data nor index is 512 bits wide, an
	/// 8-element gather only gets its index sign-extended to v8i64, while
	/// narrower gathers have all operands widened to 8 elements and the low
	/// subvector of the wide result extracted.
	static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

	  auto *Gather = cast<MaskedGatherSDNode>(Op.getNode());
	  SDLoc Loc(Op);
	  const MVT ResultVT = Op.getSimpleValueType();
	  SDValue Idx = Gather->getIndex();
	  SDValue Msk = Gather->getMask();
	  SDValue PassThru = Gather->getValue();
	  const MVT IdxVT = Idx.getSimpleValueType();
	  const MVT MskVT = Msk.getSimpleValueType();

	  const unsigned OrigNumElts = ResultVT.getVectorNumElements();
	  assert(ResultVT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

	  // If the index is v2i32, we're being called by type legalization.
	  if (IdxVT == MVT::v2i32)
	    return SDValue();

	  // Creates the target gather node with result types
	  // (DataVT, MaskOutVT, Other).
	  auto BuildGather = [&](MVT DataVT, MVT MaskOutVT, SDValue Src,
	                         SDValue GatherMask, SDValue GatherIdx) -> SDValue {
	    SDValue Operands[] = {Gather->getChain(), Src, GatherMask,
	                          Gather->getBasePtr(), GatherIdx};
	    return DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	        DAG.getVTList(DataVT, MaskOutVT, MVT::Other), Operands, Loc,
	        Gather->getMemoryVT(), Gather->getMemOperand());
	  };

	  // Gathers at the original data/mask types, using the given index.
	  auto GatherUnwidened = [&](SDValue GatherIdx) -> SDValue {
	    SDValue NewGather =
	        BuildGather(ResultVT, MskVT, PassThru, Msk, GatherIdx);
	    return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, Loc);
	  };

	  // AVX512F alone only has 512-bit gathers, so either the data or the index
	  // has to be 512 bits wide.
	  const bool MustUse512 = Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
	                          !ResultVT.is512BitVector() &&
	                          !IdxVT.is512BitVector();
	  if (!MustUse512)
	    return GatherUnwidened(Idx);

	  // With 8 elements it is enough to sign-extend the index.
	  if (OrigNumElts == 8)
	    return GatherUnwidened(
	        DAG.getNode(ISD::SIGN_EXTEND, Loc, MVT::v8i64, Idx));

	  // Otherwise widen everything to the minimal gather width of 8 elements.
	  const unsigned WideNumElts = 8;

	  // Index
	  const MVT WideIdxVT = MVT::getVectorVT(IdxVT.getScalarType(), WideNumElts);
	  Idx = ExtendToType(Idx, WideIdxVT, DAG);
	  if (IdxVT.getScalarType() == MVT::i32)
	    Idx = DAG.getNode(ISD::SIGN_EXTEND, Loc, MVT::v8i64, Idx);

	  // Mask: the operand has been promoted by now; zero-extend it, then
	  // truncate to an i1 vector.
	  const MVT BitMaskVT = MVT::getVectorVT(MVT::i1, WideNumElts);
	  assert(MskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
	  const MVT WideMskVT = MVT::getVectorVT(MskVT.getScalarType(), WideNumElts);
	  Msk = ExtendToType(Msk, WideMskVT, DAG, true);
	  Msk = DAG.getNode(ISD::TRUNCATE, Loc, BitMaskVT, Msk);

	  // The pass-through value
	  const MVT WideDataVT =
	      MVT::getVectorVT(ResultVT.getScalarType(), WideNumElts);
	  PassThru = ExtendToType(PassThru, WideDataVT, DAG);

	  SDValue WideGather = BuildGather(WideDataVT, BitMaskVT, PassThru, Msk, Idx);
	  SDValue LowPart =
	      DAG.getNode(ISD::EXTRACT_SUBVECTOR, Loc, ResultVT,
	                  WideGather.getValue(0), DAG.getIntPtrConstant(0, Loc));
	  SDValue Results[] = {LowPart, WideGather.getValue(2)};
	  return DAG.getMergeValues(Results, Loc);
	}

	/// Lower GC_TRANSITION_START to an X86::NOOP machine node.
	///
	/// The NOOP takes the first operand of \p Op and, if \p Op has a glued node,
	/// its last operand; it produces (Other, Glue) values.
	SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SDLoc Loc(Op);

	  SmallVector<SDValue, 2> NoopOperands;
	  NoopOperands.push_back(Op.getOperand(0));
	  if (Op->getGluedNode() != nullptr) {
	    const unsigned LastIdx = Op->getNumOperands() - 1;
	    NoopOperands.push_back(Op->getOperand(LastIdx));
	  }

	  SDNode *Noop = DAG.getMachineNode(
	      X86::NOOP, Loc, DAG.getVTList(MVT::Other, MVT::Glue), NoopOperands);
	  return SDValue(Noop, 0);
	}

	/// Lower GC_TRANSITION_END to an X86::NOOP machine node.
	///
	/// The NOOP takes the first operand of \p Op and, if \p Op has a glued node,
	/// its last operand; it produces (Other, Glue) values.
	SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SDLoc Loc(Op);

	  SmallVector<SDValue, 2> NoopOperands;
	  NoopOperands.push_back(Op.getOperand(0));
	  if (Op->getGluedNode() != nullptr) {
	    const unsigned LastIdx = Op->getNumOperands() - 1;
	    NoopOperands.push_back(Op->getOperand(LastIdx));
	  }

	  SDNode *Noop = DAG.getMachineNode(
	      X86::NOOP, Loc, DAG.getVTList(MVT::Other, MVT::Glue), NoopOperands);
	  return SDValue(Noop, 0);
	}

	/// Provide custom lowering hooks for some operations.
	///
	/// Dispatches on the opcode of \p Op to the matching Lower*/lower* helper and
	/// returns that helper's result. An opcode that has no case below reaches
	/// llvm_unreachable.
	SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	switch (Op.getOpcode()) {
	default: llvm_unreachable("Should not custom lower this!");
	// Atomics and bit counting.
	case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	return LowerCMP_SWAP(Op, Subtarget, DAG);
	case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
	case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
	case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
	// Vector construction, shuffling and element/subvector access.
	case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
	case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
	case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
	case ISD::VSELECT: return LowerVSELECT(Op, DAG);
	case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
	case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
	case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
	// Address-like nodes.
	case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
	case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
	case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
	case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
	case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
	case ISD::SHL_PARTS:
	case ISD::SRA_PARTS:
	case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
	// Conversions, extensions and truncations.
	case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
	case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
	case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
	case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
	case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
	case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
	case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
	case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
	// Floating-point sign operations, comparisons, selects and branches.
	case ISD::FABS:
	case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
	case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
	case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
	case ISD::SETCC: return LowerSETCC(Op, DAG);
	case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
	case ISD::SELECT: return LowerSELECT(Op, DAG);
	case ISD::BRCOND: return LowerBRCOND(Op, DAG);
	case ISD::JumpTable: return LowerJumpTable(Op, DAG);
	// Varargs, intrinsics, frame and exception-handling nodes.
	case ISD::VASTART: return LowerVASTART(Op, DAG);
	case ISD::VAARG: return LowerVAARG(Op, DAG);
	case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
	case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	case ISD::INTRINSIC_VOID:
	case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
	case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
	case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
	case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
	case ISD::FRAME_TO_ARGS_OFFSET:
	return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
	case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
	case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
	case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
	case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
	case ISD::EH_SJLJ_SETUP_DISPATCH:
	return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
	case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
	case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
	case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
	// Integer arithmetic, shifts and rotates.
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
	case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
	case ISD::MULHS:
	case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
	case ISD::UMUL_LOHI:
	case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
	case ISD::ROTL:
	case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
	case ISD::SRA:
	case ISD::SRL:
	case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
	case ISD::SADDO:
	case ISD::UADDO:
	case ISD::SSUBO:
	case ISD::USUBO:
	case ISD::SMULO:
	case ISD::UMULO: return LowerXALUO(Op, DAG);
	case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
	case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
	case ISD::ADDCARRY:
	case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
	case ISD::ADD:
	case ISD::SUB: return LowerADD_SUB(Op, DAG);
	case ISD::SMAX:
	case ISD::SMIN:
	case ISD::UMAX:
	case ISD::UMIN: return LowerMINMAX(Op, DAG);
	case ISD::ABS: return LowerABS(Op, DAG);
	case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
	// Masked memory operations, GC transitions and truncating stores.
	case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
	case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
	case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
	case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
	case ISD::GC_TRANSITION_START:
	return LowerGC_TRANSITION_START(Op, DAG);
	case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
	case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
	}
	}

	/// Runs LowerOperation on value 0 of \p N and appends the lowered values to
	/// \p Results, one per result value of \p N and in the same order. \p Results
	/// is left untouched when LowerOperation returns an empty SDValue, which
	/// indicates that the node is not to be custom lowered after all.
	void X86TargetLowering::LowerOperationWrapper(SDNode *N,
	                                              SmallVectorImpl<SDValue> &Results,
	                                              SelectionDAG &DAG) const {
	  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);
	  if (Lowered.getNode() == nullptr)
	    return;

	  assert((N->getNumValues() <= Lowered->getNumValues()) &&
	         "Lowering returned the wrong number of results!");

	  // Copy results based on the result count of N. In some cases
	  // (LowerSINT_TO_FP for example) the lowered node has more result values
	  // than the original node; the surplus trailing value (the chain) is
	  // dropped.
	  const unsigned NumWanted = N->getNumValues();
	  for (unsigned ResNo = 0; ResNo < NumWanted; ++ResNo)
	    Results.push_back(Lowered.getValue(ResNo));
	}

	/// Replace a node with an illegal result type with a new node built out of
	/// custom code.
	void X86TargetLowering::ReplaceNodeResults(SDNode *N,
	SmallVectorImpl<SDValue>&Results,
	SelectionDAG &DAG) const {
	SDLoc dl(N);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	switch (N->getOpcode()) {
	default:
	llvm_unreachable("Do not know how to custom type legalize this operation!");
	case X86ISD::AVG: {
	// Legalize types for X86ISD::AVG by expanding vectors.
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

	auto InVT = N->getValueType(0);
	auto InVTSize = InVT.getSizeInBits();
	const unsigned RegSize =
	(InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
	assert((Subtarget.hasBWI() \|\| RegSize < 512) &&
	"512-bit vector requires AVX512BW");
	assert((Subtarget.hasAVX2() \|\| RegSize < 256) &&
	"256-bit vector requires AVX2");

	auto ElemVT = InVT.getVectorElementType();
	auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
	RegSize / ElemVT.getSizeInBits());
	assert(RegSize % InVT.getSizeInBits() == 0);
	unsigned NumConcat = RegSize / InVT.getSizeInBits();

	SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
	Ops[0] = N->getOperand(0);
	SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
	Ops[0] = N->getOperand(1);
	SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

	SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
	if (!ExperimentalVectorWideningLegalization)
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	return;
	}
	// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
	case X86ISD::FMINC:
	case X86ISD::FMIN:
	case X86ISD::FMAXC:
	case X86ISD::FMAX: {
	EVT VT = N->getValueType(0);
	assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
	SDValue UNDEF = DAG.getUNDEF(VT);
	SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(0), UNDEF);
	SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(1), UNDEF);
	Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
	return;
	}
	case ISD::SDIV:
	case ISD::UDIV:
	case ISD::SREM:
	case ISD::UREM:
	case ISD::SDIVREM:
	case ISD::UDIVREM: {
	SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
	Results.push_back(V);
	return;
	}
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: {
	bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

	if (N->getValueType(0) == MVT::v2i32) {
	assert((IsSigned \|\| Subtarget.hasAVX512()) &&
	"Can only handle signed conversion without AVX512");
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	SDValue Src = N->getOperand(0);
	if (Src.getValueType() == MVT::v2f64) {
	MVT ResVT = MVT::v4i32;
	unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	if (!IsSigned && !Subtarget.hasVLX()) {
	// Widen to 512-bits.
	ResVT = MVT::v8i32;
	Opc = ISD::FP_TO_UINT;
	Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
	DAG.getUNDEF(MVT::v8f64),
	Src, DAG.getIntPtrConstant(0, dl));
	}
	SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
	ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
	: MVT::v2i32;
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	return;
	}
	if (Src.getValueType() == MVT::v2f32) {
	SDValue Idx = DAG.getIntPtrConstant(0, dl);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	DAG.getUNDEF(MVT::v2f32));
	Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
	: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
	if (!ExperimentalVectorWideningLegalization)
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
	Results.push_back(Res);
	return;
	}

	// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
	// so early out here.
	return;
	}

	std::pair<SDValue,SDValue> Vals =
	FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /IsReplace=/ true);
	SDValue FIST = Vals.first, StackSlot = Vals.second;
	if (FIST.getNode()) {
	EVT VT = N->getValueType(0);
	// Return a load from the stack slot.
	if (StackSlot.getNode())
	Results.push_back(
	DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
	else
	Results.push_back(FIST);
	}
	return;
	}
	case ISD::SINT_TO_FP: {
	assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	SDValue Src = N->getOperand(0);
	if (N->getValueType(0) != MVT::v2f32 \|\| Src.getValueType() != MVT::v2i64)
	return;
	Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
	return;
	}
	case ISD::UINT_TO_FP: {
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	EVT VT = N->getValueType(0);
	if (VT != MVT::v2f32)
	return;
	SDValue Src = N->getOperand(0);
	EVT SrcVT = Src.getValueType();
	if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
	Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
	return;
	}
	if (SrcVT != MVT::v2i32)
	return;
	SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
	SDValue VBias =
	DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
	SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
	DAG.getBitcast(MVT::v2i64, VBias));
	Or = DAG.getBitcast(MVT::v2f64, Or);
	// TODO: Are there any fast-math-flags to propagate here?
	SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
	Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
	return;
	}
	case ISD::FP_ROUND: {
	if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
	return;
	SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
	Results.push_back(V);
	return;
	}
	case ISD::FP_EXTEND: {
	// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
	// No other ValueType for FP_EXTEND should reach this point.
	assert(N->getValueType(0) == MVT::v2f32 &&
	"Do not know how to legalize this Node");
	return;
	}
	case ISD::INTRINSIC_W_CHAIN: {
	unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	switch (IntNo) {
	default : llvm_unreachable("Do not know how to custom type "
	"legalize this intrinsic operation!");
	case Intrinsic::x86_rdtsc:
	return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
	Results);
	case Intrinsic::x86_rdtscp:
	return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
	Results);
	case Intrinsic::x86_rdpmc:
	return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);

	case Intrinsic::x86_xgetbv:
	return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
	}
	}
	case ISD::INTRINSIC_WO_CHAIN: {
	if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
	Results.push_back(V);
	return;
	}
	case ISD::READCYCLECOUNTER: {
	return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
	Results);
	}
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	EVT T = N->getValueType(0);
	assert((T == MVT::i64 \|\| T == MVT::i128) && "can only expand cmpxchg pair");
	bool Regs64bit = T == MVT::i128;
	MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
	SDValue cpInL, cpInH;
	cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	DAG.getConstant(0, dl, HalfT));
	cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	DAG.getConstant(1, dl, HalfT));
	cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
	Regs64bit ? X86::RAX : X86::EAX,
	cpInL, SDValue());
	cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
	Regs64bit ? X86::RDX : X86::EDX,
	cpInH, cpInL.getValue(1));
	SDValue swapInL, swapInH;
	swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	DAG.getConstant(0, dl, HalfT));
	swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	DAG.getConstant(1, dl, HalfT));
	swapInH =
	DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
	swapInH, cpInH.getValue(1));
	// If the current function needs the base pointer, RBX,
	// we shouldn't use cmpxchg directly.
	// Indeed the lowering of that instruction will clobber
	// that register and since RBX will be a reserved register
	// the register allocator will not make sure its value will
	// be properly saved and restored around this live-range.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	SDValue Result;
	SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
	unsigned BasePtr = TRI->getBaseRegister();
	MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
	(BasePtr == X86::RBX \|\| BasePtr == X86::EBX)) {
	// ISel prefers the LCMPXCHG64 variant.
	// If that assert breaks, that means it is not the case anymore,
	// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
	// not just EBX. This is a matter of accepting i64 input for that
	// pseudo, and restoring into the register of the right wide
	// in expand pseudo. Everything else should just work.
	assert(((Regs64bit == (BasePtr == X86::RBX)) \|\| BasePtr == X86::EBX) &&
	"Saving only half of the RBX");
	unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
	: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
	SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
	Regs64bit ? X86::RBX : X86::EBX,
	HalfT, swapInH.getValue(1));
	SDValue Ops[] = {/Chain/ RBXSave.getValue(1), N->getOperand(1), swapInL,
	RBXSave,
	/Glue/ RBXSave.getValue(2)};
	Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	} else {
	unsigned Opcode =
	Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
	swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
	Regs64bit ? X86::RBX : X86::EBX, swapInL,
	swapInH.getValue(1));
	SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
	swapInL.getValue(1)};
	Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	}
	SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
	Regs64bit ? X86::RAX : X86::EAX,
	HalfT, Result.getValue(1));
	SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
	Regs64bit ? X86::RDX : X86::EDX,
	HalfT, cpOutL.getValue(2));
	SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

	SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
	MVT::i32, cpOutH.getValue(2));
	SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
	Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

	Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
	Results.push_back(Success);
	Results.push_back(EFLAGS.getValue(1));
	return;
	}
	case ISD::ATOMIC_SWAP:
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_AND:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_NAND:
	case ISD::ATOMIC_LOAD_MIN:
	case ISD::ATOMIC_LOAD_MAX:
	case ISD::ATOMIC_LOAD_UMIN:
	case ISD::ATOMIC_LOAD_UMAX:
	case ISD::ATOMIC_LOAD: {
	// Delegate to generic TypeLegalization. Situations we can really handle
	// should have already been dealt with by AtomicExpandPass.cpp.
	break;
	}
	case ISD::BITCAST: {
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	EVT DstVT = N->getValueType(0);
	EVT SrcVT = N->getOperand(0).getValueType();

	if (SrcVT != MVT::f64 \|\|
	(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
	return;

	unsigned NumElts = DstVT.getVectorNumElements();
	EVT SVT = DstVT.getVectorElementType();
	EVT WiderVT = EVT::getVectorVT(DAG.getContext(), SVT, NumElts 2);
	SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	MVT::v2f64, N->getOperand(0));
	SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

	if (ExperimentalVectorWideningLegalization) {
	// If we are legalizing vectors by widening, we already have the desired
	// legal vector type, just return it.
	Results.push_back(ToVecInt);
	return;
	}

	SmallVector<SDValue, 8> Elts;
	for (unsigned i = 0, e = NumElts; i != e; ++i)
	Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
	ToVecInt, DAG.getIntPtrConstant(i, dl)));

	Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
	return;
	}
	case ISD::MGATHER: {
	EVT VT = N->getValueType(0);
	if (VT == MVT::v2f32 && (Subtarget.hasVLX() \|\| !Subtarget.hasAVX512())) {
	auto *Gather = cast<MaskedGatherSDNode>(N);
	SDValue Index = Gather->getIndex();
	if (Index.getValueType() != MVT::v2i64)
	return;
	SDValue Mask = Gather->getMask();
	assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	Gather->getValue(),
	DAG.getUNDEF(MVT::v2f32));
	if (!Subtarget.hasVLX()) {
	// We need to widen the mask, but the instruction will only use 2
	// of its elements. So we can use undef.
	Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	DAG.getUNDEF(MVT::v2i1));
	Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	}
	SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	Index };
	SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
	Gather->getMemoryVT(), Gather->getMemOperand());
	Results.push_back(Res);
	Results.push_back(Res.getValue(2));
	return;
	}
	if (VT == MVT::v2i32) {
	auto *Gather = cast<MaskedGatherSDNode>(N);
	SDValue Index = Gather->getIndex();
	SDValue Mask = Gather->getMask();
	assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
	Gather->getValue(),
	DAG.getUNDEF(MVT::v2i32));
	// If the index is v2i64 we can use it directly.
	if (Index.getValueType() == MVT::v2i64 &&
	(Subtarget.hasVLX() \|\| !Subtarget.hasAVX512())) {
	if (!Subtarget.hasVLX()) {
	// We need to widen the mask, but the instruction will only use 2
	// of its elements. So we can use undef.
	Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	DAG.getUNDEF(MVT::v2i1));
	Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	}
	SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	Index };
	SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
	Gather->getMemoryVT(), Gather->getMemOperand());
	SDValue Chain = Res.getValue(2);
	if (!ExperimentalVectorWideningLegalization)
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	Results.push_back(Chain);
	return;
	}
	EVT IndexVT = Index.getValueType();
	EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
	IndexVT.getScalarType(), 4);
	// Otherwise we need to custom widen everything to avoid promotion.
	Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
	DAG.getUNDEF(IndexVT));
	Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	DAG.getConstant(0, dl, MVT::v2i1));
	SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	Index };
	SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
	Gather->getMemoryVT(), dl, Ops,
	Gather->getMemOperand());
	SDValue Chain = Res.getValue(1);
	if (!ExperimentalVectorWideningLegalization)
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	Results.push_back(Chain);
	return;
	}
	break;
	}
	}
	}

	const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((X86ISD::NodeType)Opcode) {
	case X86ISD::FIRST_NUMBER: break;
	case X86ISD::BSF: return "X86ISD::BSF";
	case X86ISD::BSR: return "X86ISD::BSR";
	case X86ISD::SHLD: return "X86ISD::SHLD";
	case X86ISD::SHRD: return "X86ISD::SHRD";
	case X86ISD::FAND: return "X86ISD::FAND";
	case X86ISD::FANDN: return "X86ISD::FANDN";
	case X86ISD::FOR: return "X86ISD::FOR";
	case X86ISD::FXOR: return "X86ISD::FXOR";
	case X86ISD::FILD: return "X86ISD::FILD";
	case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
	case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
	case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
	case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
	case X86ISD::FLD: return "X86ISD::FLD";
	case X86ISD::FST: return "X86ISD::FST";
	case X86ISD::CALL: return "X86ISD::CALL";
	case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
	case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
	case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
	case X86ISD::BT: return "X86ISD::BT";
	case X86ISD::CMP: return "X86ISD::CMP";
	case X86ISD::COMI: return "X86ISD::COMI";
	case X86ISD::UCOMI: return "X86ISD::UCOMI";
	case X86ISD::CMPM: return "X86ISD::CMPM";
	case X86ISD::CMPMU: return "X86ISD::CMPMU";
	case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
	case X86ISD::SETCC: return "X86ISD::SETCC";
	case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
	case X86ISD::FSETCC: return "X86ISD::FSETCC";
	case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
	case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
	case X86ISD::CMOV: return "X86ISD::CMOV";
	case X86ISD::BRCOND: return "X86ISD::BRCOND";
	case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
	case X86ISD::IRET: return "X86ISD::IRET";
	case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
	case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
	case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
	case X86ISD::Wrapper: return "X86ISD::Wrapper";
	case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
	case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
	case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
	case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
	case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
	case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
	case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
	case X86ISD::PINSRB: return "X86ISD::PINSRB";
	case X86ISD::PINSRW: return "X86ISD::PINSRW";
	case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
	case X86ISD::ANDNP: return "X86ISD::ANDNP";
	case X86ISD::BLENDI: return "X86ISD::BLENDI";
	case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
	case X86ISD::ADDUS: return "X86ISD::ADDUS";
	case X86ISD::SUBUS: return "X86ISD::SUBUS";
	case X86ISD::HADD: return "X86ISD::HADD";
	case X86ISD::HSUB: return "X86ISD::HSUB";
	case X86ISD::FHADD: return "X86ISD::FHADD";
	case X86ISD::FHSUB: return "X86ISD::FHSUB";
	case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
	case X86ISD::FMAX: return "X86ISD::FMAX";
	case X86ISD::FMAXS: return "X86ISD::FMAXS";
	case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
	case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
	case X86ISD::FMIN: return "X86ISD::FMIN";
	case X86ISD::FMINS: return "X86ISD::FMINS";
	case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
	case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
	case X86ISD::FMAXC: return "X86ISD::FMAXC";
	case X86ISD::FMINC: return "X86ISD::FMINC";
	case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
	case X86ISD::FRCP: return "X86ISD::FRCP";
	case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
	case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
	case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
	case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
	case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
	case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
	case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
	case X86ISD::EH_SJLJ_SETUP_DISPATCH:
	return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
	case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
	case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
	case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
	case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
	case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
	case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
	case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
	case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
	return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
	case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
	return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
	case X86ISD::LADD: return "X86ISD::LADD";
	case X86ISD::LSUB: return "X86ISD::LSUB";
	case X86ISD::LOR: return "X86ISD::LOR";
	case X86ISD::LXOR: return "X86ISD::LXOR";
	case X86ISD::LAND: return "X86ISD::LAND";
	case X86ISD::LINC: return "X86ISD::LINC";
	case X86ISD::LDEC: return "X86ISD::LDEC";
	case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
	case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
	case X86ISD::VZEXT: return "X86ISD::VZEXT";
	case X86ISD::VSEXT: return "X86ISD::VSEXT";
	case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
	case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
	case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
	case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
	case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
	case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
	case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
	case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
	case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
	case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
	case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
	case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
	case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
	case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
	case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
	case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
	case X86ISD::VSHL: return "X86ISD::VSHL";
	case X86ISD::VSRL: return "X86ISD::VSRL";
	case X86ISD::VSRA: return "X86ISD::VSRA";
	case X86ISD::VSHLI: return "X86ISD::VSHLI";
	case X86ISD::VSRLI: return "X86ISD::VSRLI";
	case X86ISD::VSRAI: return "X86ISD::VSRAI";
	case X86ISD::VSRAV: return "X86ISD::VSRAV";
	case X86ISD::VROTLI: return "X86ISD::VROTLI";
	case X86ISD::VROTRI: return "X86ISD::VROTRI";
	case X86ISD::VPPERM: return "X86ISD::VPPERM";
	case X86ISD::CMPP: return "X86ISD::CMPP";
	case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
	case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
	case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
	case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
	case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
	case X86ISD::ADD: return "X86ISD::ADD";
	case X86ISD::SUB: return "X86ISD::SUB";
	case X86ISD::ADC: return "X86ISD::ADC";
	case X86ISD::SBB: return "X86ISD::SBB";
	case X86ISD::SMUL: return "X86ISD::SMUL";
	case X86ISD::UMUL: return "X86ISD::UMUL";
	case X86ISD::SMUL8: return "X86ISD::SMUL8";
	case X86ISD::UMUL8: return "X86ISD::UMUL8";
	case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
	case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
	case X86ISD::INC: return "X86ISD::INC";
	case X86ISD::DEC: return "X86ISD::DEC";
	case X86ISD::OR: return "X86ISD::OR";
	case X86ISD::XOR: return "X86ISD::XOR";
	case X86ISD::AND: return "X86ISD::AND";
	case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
	case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
	case X86ISD::PTEST: return "X86ISD::PTEST";
	case X86ISD::TESTP: return "X86ISD::TESTP";
	case X86ISD::TESTM: return "X86ISD::TESTM";
	case X86ISD::TESTNM: return "X86ISD::TESTNM";
	case X86ISD::KORTEST: return "X86ISD::KORTEST";
	case X86ISD::KTEST: return "X86ISD::KTEST";
	case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
	case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
	case X86ISD::PACKSS: return "X86ISD::PACKSS";
	case X86ISD::PACKUS: return "X86ISD::PACKUS";
	case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
	case X86ISD::VALIGN: return "X86ISD::VALIGN";
	case X86ISD::VSHLD: return "X86ISD::VSHLD";
	case X86ISD::VSHRD: return "X86ISD::VSHRD";
	case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
	case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
	case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
	case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
	case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
	case X86ISD::SHUFP: return "X86ISD::SHUFP";
	case X86ISD::SHUF128: return "X86ISD::SHUF128";
	case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
	case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
	case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
	case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
	case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
	case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
	case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
	case X86ISD::MOVSD: return "X86ISD::MOVSD";
	case X86ISD::MOVSS: return "X86ISD::MOVSS";
	case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
	case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
	case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
	case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
	case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
	case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
	case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
	case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
	case X86ISD::VPERMV: return "X86ISD::VPERMV";
	case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
	case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
	case X86ISD::VPERMI: return "X86ISD::VPERMI";
	case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
	case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
	case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
	case X86ISD::VRANGE: return "X86ISD::VRANGE";
	case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
	case X86ISD::VRANGES: return "X86ISD::VRANGES";
	case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
	case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
	case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
	case X86ISD::PSADBW: return "X86ISD::PSADBW";
	case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
	case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
	case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
	case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
	case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
	case X86ISD::MFENCE: return "X86ISD::MFENCE";
	case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
	case X86ISD::SAHF: return "X86ISD::SAHF";
	case X86ISD::RDRAND: return "X86ISD::RDRAND";
	case X86ISD::RDSEED: return "X86ISD::RDSEED";
	case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
	case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
	case X86ISD::VPSHA: return "X86ISD::VPSHA";
	case X86ISD::VPSHL: return "X86ISD::VPSHL";
	case X86ISD::VPCOM: return "X86ISD::VPCOM";
	case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
	case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
	case X86ISD::FMSUB: return "X86ISD::FMSUB";
	case X86ISD::FNMADD: return "X86ISD::FNMADD";
	case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
	case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
	case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
	case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
	case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
	case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
	case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
	case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
	case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
	case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
	case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
	case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
	case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
	case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
	case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
	case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
	case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
	case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
	case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
	case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
	case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
	case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
	case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
	case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
	case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
	case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
	case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
	case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
	case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
	case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
	case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
	case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
	case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
	case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
	case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
	case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
	case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
	case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
	case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
	case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
	case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
	case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
	case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
	case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
	case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
	case X86ISD::XTEST: return "X86ISD::XTEST";
	case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
	case X86ISD::EXPAND: return "X86ISD::EXPAND";
	case X86ISD::SELECT: return "X86ISD::SELECT";
	case X86ISD::SELECTS: return "X86ISD::SELECTS";
	case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
	case X86ISD::RCP14: return "X86ISD::RCP14";
	case X86ISD::RCP14S: return "X86ISD::RCP14S";
	case X86ISD::RCP28: return "X86ISD::RCP28";
	case X86ISD::RCP28S: return "X86ISD::RCP28S";
	case X86ISD::EXP2: return "X86ISD::EXP2";
	case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
	case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
	case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
	case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
	case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
	case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
	case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
	case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
	case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
	case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
	case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
	case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
	case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
	case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
	case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
	case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
	case X86ISD::SCALEF: return "X86ISD::SCALEF";
	case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
	case X86ISD::ADDS: return "X86ISD::ADDS";
	case X86ISD::SUBS: return "X86ISD::SUBS";
	case X86ISD::AVG: return "X86ISD::AVG";
	case X86ISD::MULHRS: return "X86ISD::MULHRS";
	case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
	case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
	case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
	case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
	case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
	case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
	case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
	case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
	case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
	case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
	case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
	case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
	case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
	case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
	case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
	case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
	case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
	case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
	case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
	case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
	case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
	case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
	case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
	case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
	case X86ISD::LWPINS: return "X86ISD::LWPINS";
	case X86ISD::MGATHER: return "X86ISD::MGATHER";
	case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
	case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
	case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
	case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
	case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
	case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
	case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
	case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
	case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
	}
	return nullptr;
	}

	/// Return true if the addressing mode represented by AM is legal for this
	/// target, for a load/store of the specified type.
	bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS,
	Instruction *I) const {
	// X86 supports extremely general addressing modes.
	CodeModel::Model M = getTargetMachine().getCodeModel();

	// X86 allows a sign-extended 32-bit immediate field as a displacement.
	if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
	return false;

	if (AM.BaseGV) {
	unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

	// If a reference to this global requires an extra load, we can't fold it.
	if (isGlobalStubReference(GVFlags))
	return false;

	// If BaseGV requires a register for the PIC base, we cannot also have a
	// BaseReg specified.
	if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
	return false;

	// If lower 4G is not available, then we must use rip-relative addressing.
	if ((M != CodeModel::Small \|\| isPositionIndependent()) &&
	Subtarget.is64Bit() && (AM.BaseOffs \|\| AM.Scale > 1))
	return false;
	}

	switch (AM.Scale) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
	// These scales always work.
	break;
	case 3:
	case 5:
	case 9:
	// These scales are formed with basereg+scalereg. Only accept if there is
	// no basereg yet.
	if (AM.HasBaseReg)
	return false;
	break;
	default: // Other stuff never works.
	return false;
	}

	return true;
	}

	/// Report whether shifting a vector of type \p Ty by a scalar amount is
	/// noticeably cheaper than shifting it by a fully general vector amount.
	bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
	  const unsigned EltBits = Ty->getScalarSizeInBits();

	  // Byte-element shifts are expensive either way; a scalar amount does not
	  // make them meaningfully cheaper.
	  if (EltBits == 8)
	    return false;

	  // With AVX2, vpsllv[dq] and friends make variable 32/64-bit shifts as cheap
	  // as scalar-amount ones, so there is nothing to gain in that case.
	  const bool HasCheapVariableShift =
	      Subtarget.hasAVX2() && (EltBits == 32 || EltBits == 64);

	  // Everywhere else the scalar-amount form is the significantly cheaper one.
	  return !HasCheapVariableShift;
	}

	bool X86TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::allowTruncateForTailCall(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;

	if (!isTypeLegal(EVT::getEVT(Ty1)))
	return false;

	assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

	// Assuming the caller doesn't have a zeroext or signext return parameter,
	// truncation all the way down to i1 is valid.
	return true;
	}

	/// An integer-compare immediate is legal when it fits in a signed 32-bit
	/// field.
	bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  const bool FitsInSigned32 = isInt<32>(Imm);
	  return FitsInSigned32;
	}

	/// An add immediate is legal when it fits in a signed 32-bit field; negated
	/// immediates are covered as well because a sub can be used for them.
	bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  const bool FitsInSigned32 = isInt<32>(Imm);
	  return FitsInSigned32;
	}

	/// Report whether truncating from value type \p VT1 to \p VT2 is free: both
	/// must be integer types and \p VT1 must be strictly wider than \p VT2.
	bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  // The sizes are only compared once both types are known to be integers.
	  return VT1.isInteger() && VT2.isInteger() &&
	         VT1.getSizeInBits() > VT2.getSizeInBits();
	}

	bool X86TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
	return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
	}

	/// Report whether zero-extending from value type \p VT1 to \p VT2 is free,
	/// which holds only for i32 -> i64 on a 64-bit subtarget.
	bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  // Anything other than an i32 -> i64 widening is never free here.
	  if (!(VT1 == MVT::i32 && VT2 == MVT::i64))
	    return false;

	  // On x86-64 a 32-bit result is implicitly zero-extended in its 64-bit
	  // register.
	  return Subtarget.is64Bit();
	}

	/// Report whether zero-extending the value \p Val to type \p VT2 is free.
	///
	/// Besides the type-only rule, a load producing an i8, i16 or i32 value
	/// counts as free when both types are simple integer types.
	bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  const EVT SrcVT = Val.getValueType();

	  // First try the purely type-based rule.
	  if (isZExtFree(SrcVT, VT2))
	    return true;

	  // Beyond that, only loads can be extended for free.
	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  const bool BothSimpleInts = SrcVT.isSimple() && SrcVT.isInteger() &&
	                              VT2.isSimple() && VT2.isInteger();
	  if (!BothSimpleInts)
	    return false;

	  // X86 has 8, 16, and 32-bit zero-extending loads.
	  const MVT::SimpleValueType LoadedTy = SrcVT.getSimpleVT().SimpleTy;
	  return LoadedTy == MVT::i8 || LoadedTy == MVT::i16 || LoadedTy == MVT::i32;
	}

	/// Vector load extension is reported as desirable unconditionally; the
	/// operand is ignored.
	bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const {
	  return true;
	}

	/// Report whether a fused multiply-add on \p VT is preferred over a separate
	/// multiply and add: the subtarget must have some form of FMA and the scalar
	/// element type of \p VT must be f32 or f64.
	bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  if (!Subtarget.hasAnyFMA())
	    return false;

	  // Only the element type matters, so vectors are judged by their scalars.
	  const EVT EltVT = VT.getScalarType();
	  if (!EltVT.isSimple())
	    return false;

	  const MVT::SimpleValueType EltTy = EltVT.getSimpleVT().SimpleTy;
	  return EltTy == MVT::f32 || EltTy == MVT::f64;
	}

	/// Report whether narrowing an operation from \p VT1 to \p VT2 is
	/// worthwhile. Every combination is accepted except i32 -> i16.
	bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
	  // i16 instructions are longer (0x66 prefix) and potentially slower.
	  const bool IsI32ToI16 = VT1 == MVT::i32 && VT2 == MVT::i16;
	  return !IsI32ToI16;
	}

	/// Hook that lets a target restrict which VECTOR_SHUFFLE masks it supports.
	/// The generic default treats every mask as legal once the target supports
	/// the VECTOR_SHUFFLE node.
	///
	/// Here the mask \p M itself is not inspected; the answer depends only on
	/// \p VT.
	bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
	  if (!VT.isSimple())
	    return false;

	  const MVT SimpleVT = VT.getSimpleVT();

	  // i1 vectors are excluded, and very little shuffling can be done for 64-bit
	  // vectors right now.
	  if (SimpleVT.getScalarType() == MVT::i1 || SimpleVT.getSizeInBits() == 64)
	    return false;

	  // Beyond that only the legality of the shuffled type matters; the lowering
	  // copes with whatever mask comes out.
	  return isTypeLegal(SimpleVT);
	}

	/// Clear masks get no special treatment: the answer is whatever
	/// isShuffleMaskLegal gives for the same \p Mask and \p VT.
	bool X86TargetLowering::isVectorClearMaskLegal(
	    const SmallVectorImpl<int> &Mask, EVT VT) const {
	  const bool LegalAsShuffle = isShuffleMaskLegal(Mask, VT);
	  return LegalAsShuffle;
	}

	+/// Report whether jump tables may be generated for \p Fn.
	+bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
	+  // Jump tables must not be generated when the subtarget uses retpolines;
	+  // in every other case the generic TargetLowering logic decides.
	+  return !Subtarget.useRetpoline() && TargetLowering::areJTsAllowed(Fn);
	+}
	+
	//===----------------------------------------------------------------------===//
	// X86 Scheduler Hooks
	//===----------------------------------------------------------------------===//

	/// Utility function to emit xbegin specifying the start of an RTM region.
	///
	/// Replaces the pseudo instruction \p MI with an XBEGIN_4 at the end of
	/// \p MBB plus three new blocks (mainMBB, fallMBB, sinkMBB) inserted right
	/// after \p MBB. Everything that followed \p MI, together with the successor
	/// edges of \p MBB, is moved to sinkMBB, where a PHI merges the two results.
	///
	/// \param MI  The pseudo being expanded; operand 0 is the destination
	///            register. It is erased before returning.
	/// \param MBB The block containing \p MI.
	/// \param TII Instruction info used to look up the opcode descriptions.
	/// \returns sinkMBB, the block that now holds the remainder of \p MBB.
	static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
	                                     const TargetInstrInfo *TII) {
	  DebugLoc DL = MI.getDebugLoc();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // For the v = xbegin(), we generate
	  //
	  // thisMBB:
	  //  xbegin fallMBB
	  //
	  // mainMBB:
	  //  s0 = -1
	  //
	  // fallMBB:
	  //  eax = # XABORT_DEF
	  //  s1 = eax
	  //
	  // sinkMBB:
	  //  v = phi(s0/mainMBB, s1/fallMBB)

	  MachineBasicBlock *thisMBB = MBB;
	  MachineFunction *MF = MBB->getParent();
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, fallMBB);
	  MF->insert(I, sinkMBB);

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // Both PHI inputs are fresh virtual registers in the destination's class.
	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  unsigned DstReg = MI.getOperand(0).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned fallDstReg = MRI.createVirtualRegister(RC);

	  // thisMBB:
	  //  xbegin fallMBB
	  //  # fallthrough to mainMBB
	  //  # abortion to fallMBB
	  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(fallMBB);

	  // mainMBB:
	  //  mainDstReg := -1
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
	  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  mainMBB->addSuccessor(sinkMBB);

	  // fallMBB:
	  //  ; pseudo instruction to model hardware's definition from XABORT
	  //  EAX := XABORT_DEF
	  //  fallDstReg := EAX
	  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
	  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
	      .addReg(X86::EAX);
	  fallMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(fallDstReg).addMBB(fallMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
	// or XMM0_V32I8 in AVX all of this code can be replaced with that
	// in the .td file.
	//
	// Expands a (V)PCMP{I,E}STRM128{REG,MEM} pseudo into the matching rr/rm
	// instruction, inserted before \p MI in \p BB, followed by a COPY of XMM0
	// into the pseudo's operand-0 register. The pseudo is erased and \p BB is
	// returned unchanged otherwise.
	static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
	                                       const TargetInstrInfo *TII) {
	  unsigned Opc;
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("illegal opcode!");
	  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
	  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
	  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
	  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
	  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
	  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
	  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
	  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
	  }

	  DebugLoc dl = MI.getDebugLoc();
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

	  // Forward every operand after the result (operand 0), skipping implicit
	  // register operands.
	  unsigned NumArgs = MI.getNumOperands();
	  for (unsigned i = 1; i < NumArgs; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (!(Op.isReg() && Op.isImplicit()))
	      MIB.add(Op);
	  }
	  if (MI.hasOneMemOperand())
	    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

	  // The result is read back from XMM0 into the pseudo's destination.
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::XMM0);

	  MI.eraseFromParent();
	  return BB;
	}

	// FIXME: Custom handling because TableGen doesn't support multiple implicit
	// defs in an instruction pattern
	//
	// Expands a (V)PCMP{I,E}STRI{REG,MEM} pseudo into the matching rr/rm
	// instruction, inserted before \p MI in \p BB, followed by a COPY of ECX
	// into the pseudo's operand-0 register. The pseudo is erased and \p BB is
	// returned unchanged otherwise.
	static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
	                                       const TargetInstrInfo *TII) {
	  unsigned Opc;
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("illegal opcode!");
	  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
	  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
	  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
	  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
	  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
	  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
	  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
	  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
	  }

	  DebugLoc dl = MI.getDebugLoc();
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

	  // Forward every operand after the result (operand 0), skipping implicit
	  // register operands.
	  unsigned NumArgs = MI.getNumOperands(); // remove the results
	  for (unsigned i = 1; i < NumArgs; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (!(Op.isReg() && Op.isImplicit()))
	      MIB.add(Op);
	  }
	  if (MI.hasOneMemOperand())
	    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

	  // The result is read back from ECX into the pseudo's destination.
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::ECX);

	  MI.eraseFromParent();
	  return BB;
	}

	// Expands the WRPKRU pseudo \p MI: copies its operand-0 register into EAX,
	// zeroes ECX and EDX, then emits WRPKRUr. All instructions are inserted
	// before \p MI, which is erased afterwards. Returns \p BB.
	static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  // insert input VAL into EAX
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
	      .addReg(MI.getOperand(0).getReg());
	  // insert zero to ECX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

	  // insert zero to EDX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);

	  // insert WRPKRU instruction
	  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	// Expands the RDPKRU pseudo \p MI: zeroes ECX, emits RDPKRUr, then copies
	// EAX into the pseudo's operand-0 register. All instructions are inserted
	// before \p MI, which is erased afterwards. Returns \p BB.
	static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  // insert zero to ECX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

	  // insert RDPKRU instruction
	  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::EAX);

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	// Expands a monitor-style pseudo \p MI into real instructions placed before
	// it in \p BB: an LEA of the pseudo's address operands into RAX (64-bit) or
	// EAX (otherwise), copies of the two following register operands into ECX
	// and EDX, and finally the operand-less instruction \p Opc. The pseudo is
	// erased and \p BB is returned.
	static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
	                                      const X86Subtarget &Subtarget,
	                                      unsigned Opc) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  // Address into RAX/EAX, other two args into ECX, EDX.
	  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
	  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
	  for (int i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI.getOperand(i));

	  // The value operands start right after the address operands.
	  unsigned ValOps = X86::AddrNumOperands;
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
	      .addReg(MI.getOperand(ValOps).getReg());
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
	      .addReg(MI.getOperand(ValOps + 1).getReg());

	  // The instruction doesn't actually take any operands though.
	  BuildMI(*BB, MI, dl, TII->get(Opc));

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	// Expands the CLZERO pseudo \p MI: an LEA of the pseudo's address operands
	// into RAX (64-bit) or EAX (otherwise), followed by the operand-less CLZEROr.
	// Both are inserted before \p MI in \p BB; the pseudo is then erased and
	// \p BB is returned.
	static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI->getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  // Address into RAX/EAX
	  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
	  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
	  for (int i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI->getOperand(i));

	  // The instruction doesn't actually take any operands though.
	  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));

	  MI->eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}



	/// Expand the VAARG_64 pseudo \p MI located in \p MBB.
	///
	/// Depending on ArgMode the argument address is taken either only from the
	/// va_list overflow area (no new blocks are created) or, after a bounds check
	/// on gp_offset/fp_offset, from the register save area or the overflow area
	/// (three new blocks and a PHI are created). The va_list fields that were
	/// consumed are written back, and the pseudo is erased.
	///
	/// \returns the block in which code following the pseudo now lives: \p MBB
	/// itself in overflow-only mode, otherwise the newly created endMBB.
	MachineBasicBlock *
	X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// Emit va_arg instruction on X86-64.

	// Operands to this pseudo-instruction:
	// 0 ) Output : destination address (reg)
	// 1-5) Input : va_list address (addr, i64mem)
	// 6 ) ArgSize : Size (in bytes) of vararg type
	// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
	// 8 ) Align : Alignment of type
	// 9 ) EFLAGS (implicit-def)

	assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
	static_assert(X86::AddrNumOperands == 5,
	"VAARG_64 assumes 5 address operands");

	unsigned DestReg = MI.getOperand(0).getReg();
	MachineOperand &Base = MI.getOperand(1);
	MachineOperand &Scale = MI.getOperand(2);
	MachineOperand &Index = MI.getOperand(3);
	MachineOperand &Disp = MI.getOperand(4);
	MachineOperand &Segment = MI.getOperand(5);
	unsigned ArgSize = MI.getOperand(6).getImm();
	unsigned ArgMode = MI.getOperand(7).getImm();
	unsigned Align = MI.getOperand(8).getImm();

	// Memory Reference
	// The pseudo's single memory operand is attached to every load and store
	// of the va_list emitted below.
	assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
	MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	// Machine Information
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
	const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
	const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
	DebugLoc DL = MI.getDebugLoc();

	// struct va_list {
	//   i32 gp_offset
	//   i32 fp_offset
	//   i64 overflow_area (address)
	//   i64 reg_save_area (address)
	// }
	// sizeof(va_list) = 24
	// alignment(va_list) = 8

	unsigned TotalNumIntRegs = 6;
	unsigned TotalNumXMMRegs = 8;
	bool UseGPOffset = (ArgMode == 1);
	bool UseFPOffset = (ArgMode == 2);
	// Upper bound of the offset being checked: 8 bytes per integer register,
	// plus 16 bytes per XMM register when fp_offset is the field in use.
	unsigned MaxOffset = TotalNumIntRegs * 8 +
	(UseFPOffset ? TotalNumXMMRegs * 16 : 0);

	/* Align ArgSize to a multiple of 8 */
	unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
	// Only alignments above 8 need an explicit realignment of the overflow
	// address.
	bool NeedsAlign = (Align > 8);

	MachineBasicBlock *thisMBB = MBB;
	MachineBasicBlock *overflowMBB;
	MachineBasicBlock *offsetMBB;
	MachineBasicBlock *endMBB;

	unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
	unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
	unsigned OffsetReg = 0;

	if (!UseGPOffset && !UseFPOffset) {
	// If we only pull from the overflow region, we don't create a branch.
	// We don't need to alter control flow.
	OffsetDestReg = 0; // unused
	OverflowDestReg = DestReg;

	offsetMBB = nullptr;
	overflowMBB = thisMBB;
	endMBB = thisMBB;
	} else {
	// First emit code to check if gp_offset (or fp_offset) is below the bound.
	// If so, pull the argument from reg_save_area. (branch to offsetMBB)
	// If not, pull from overflow_area. (branch to overflowMBB)
	//
	//       thisMBB
	//          |     .
	//          |        .
	//      offsetMBB   overflowMBB
	//          |        .
	//          |     .
	//       endMBB

	// Registers for the PHI in endMBB
	OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
	OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	MachineFunction *MF = MBB->getParent();
	overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	MachineFunction::iterator MBBIter = ++MBB->getIterator();

	// Insert the new basic blocks
	MF->insert(MBBIter, offsetMBB);
	MF->insert(MBBIter, overflowMBB);
	MF->insert(MBBIter, endMBB);

	// Transfer the remainder of MBB and its successor edges to endMBB.
	endMBB->splice(endMBB->begin(), thisMBB,
	std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
	endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

	// Make offsetMBB and overflowMBB successors of thisMBB
	thisMBB->addSuccessor(offsetMBB);
	thisMBB->addSuccessor(overflowMBB);

	// endMBB is a successor of both offsetMBB and overflowMBB
	offsetMBB->addSuccessor(endMBB);
	overflowMBB->addSuccessor(endMBB);

	// Load the offset value into a register
	// (displacement +0 selects gp_offset, +4 selects fp_offset).
	OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// Check if there is enough room left to pull this argument.
	BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
	.addReg(OffsetReg)
	.addImm(MaxOffset + 8 - ArgSizeA8);

	// Branch to "overflowMBB" if offset >= max
	// Fall through to "offsetMBB" otherwise
	BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
	.addMBB(overflowMBB);
	}

	// In offsetMBB, emit code to use the reg_save_area.
	if (offsetMBB) {
	assert(OffsetReg != 0);

	// Read the reg_save_area address (va_list displacement +16).
	unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 16)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// Zero-extend the offset
	unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
	.addImm(0)
	.addReg(OffsetReg)
	.addImm(X86::sub_32bit);

	// Add the offset to the reg_save_area to get the final address.
	BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
	.addReg(OffsetReg64)
	.addReg(RegSaveReg);

	// Compute the offset for the next argument
	// (advance by 16 when using fp_offset, by 8 when using gp_offset).
	unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
	.addReg(OffsetReg)
	.addImm(UseFPOffset ? 16 : 8);

	// Store it back into the va_list.
	BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.addReg(NextOffsetReg)
	.setMemRefs(MMOBegin, MMOEnd);

	// Jump to endMBB
	BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
	.addMBB(endMBB);
	}

	//
	// Emit code to use overflow area
	//

	// Load the overflow_area address into a register (va_list displacement +8).
	unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// If we need to align it, do so. Otherwise, just copy the address
	// to OverflowDestReg.
	if (NeedsAlign) {
	// Align the overflow address
	assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
	unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

	// aligned_addr = (addr + (align-1)) & ~(align-1)
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
	.addReg(OverflowAddrReg)
	.addImm(Align-1);

	BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
	.addReg(TmpReg)
	.addImm(~(uint64_t)(Align-1));
	} else {
	BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
	.addReg(OverflowAddrReg);
	}

	// Compute the next overflow address after this argument.
	// (the overflow address should be kept 8-byte aligned)
	unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
	.addReg(OverflowDestReg)
	.addImm(ArgSizeA8);

	// Store the new overflow address.
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.addReg(NextAddrReg)
	.setMemRefs(MMOBegin, MMOEnd);

	// If we branched, emit the PHI to the front of endMBB.
	if (offsetMBB) {
	BuildMI(*endMBB, endMBB->begin(), DL,
	TII->get(X86::PHI), DestReg)
	.addReg(OffsetDestReg).addMBB(offsetMBB)
	.addReg(OverflowDestReg).addMBB(overflowMBB);
	}

	// Erase the pseudo instruction
	MI.eraseFromParent();

	return endMBB;
	}

	/// Expand the pseudo \p MI that saves the XMM argument registers for va_start.
	///
	/// Operand 0 of \p MI is the count register, operand 1 the register-save
	/// frame index, operand 2 the starting FP offset; operands 3 up to (but not
	/// including) the last one are the registers to store. Two blocks are
	/// created after \p MBB: one holding all the stores and one receiving the
	/// remainder of \p MBB. Unless the calling convention is Win64, a test of the
	/// count register branches around the store block when it is zero.
	///
	/// \returns the block that now holds the code which followed \p MI.
	MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *MBB) const {
	  // Emit code to save XMM registers to the stack. The ABI says that the
	  // number of registers to save is given in %al, so it's theoretically
	  // possible to do an indirect jump trick to avoid saving all of them,
	  // however this code takes a simpler approach and just executes all
	  // of the stores if %al is non-zero. It's less code, and it's probably
	  // easier on the hardware branch predictor, and stores aren't all that
	  // expensive anyway.

	  // Create the new basic blocks. One block contains all the XMM stores,
	  // and one block is the final destination regardless of whether any
	  // stores were performed.
	  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	  MachineFunction *F = MBB->getParent();
	  MachineFunction::iterator MBBIter = ++MBB->getIterator();
	  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  F->insert(MBBIter, XMMSaveMBB);
	  F->insert(MBBIter, EndMBB);

	  // Transfer the remainder of MBB and its successor edges to EndMBB.
	  EndMBB->splice(EndMBB->begin(), MBB,
	                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // The original block will now fall through to the XMM save block.
	  MBB->addSuccessor(XMMSaveMBB);
	  // The XMMSaveMBB will fall through to the end block.
	  XMMSaveMBB->addSuccessor(EndMBB);

	  // Now add the instructions.
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  unsigned CountReg = MI.getOperand(0).getReg();
	  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
	  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

	  if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
	    // If %al is 0, branch around the XMM save block.
	    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
	    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
	    MBB->addSuccessor(EndMBB);
	  }

	  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
	  // that was just emitted, but clearly shouldn't be "saved".
	  assert((MI.getNumOperands() <= 3 ||
	          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
	          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
	         "Expected last argument to be EFLAGS");
	  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
	  // In the XMM save block, save all the XMM argument registers. Each one
	  // takes a 16-byte slot starting at VarArgsFPOffset in the save frame.
	  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
	    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
	    MachineMemOperand *MMO = F->getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
	        MachineMemOperand::MOStore,
	        /*Size=*/16, /*Align=*/16);
	    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
	        .addFrameIndex(RegSaveFrameIndex)
	        .addImm(/*Scale=*/1)
	        .addReg(/*IndexReg=*/0)
	        .addImm(/*Disp=*/Offset)
	        .addReg(/*Segment=*/0)
	        .addReg(MI.getOperand(i).getReg())
	        .addMemOperand(MMO);
	  }

	  MI.eraseFromParent(); // The pseudo instruction is gone now.

	  return EndMBB;
	}

	// ISel may leave the EFLAGS operand of SelectItr without a kill marker when
	// EFLAGS has several uses and it could not tell which one is last. Decide
	// whether SelectItr really is the last reader of EFLAGS; if it is, add the
	// kill marker. The return value is the correct kill state: true when the
	// marker was added, false when EFLAGS is still needed afterwards.
	static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
	                                     MachineBasicBlock* BB,
	                                     const TargetRegisterInfo* TRI) {
	  // Walk the instructions that follow SelectItr inside BB.
	  bool ReachedBlockEnd = true;
	  for (MachineBasicBlock::iterator It = std::next(SelectItr), End = BB->end();
	       It != End; ++It) {
	    // A later reader means EFLAGS is not killed at SelectItr.
	    if (It->readsRegister(X86::EFLAGS))
	      return false;
	    // A redefinition before any read means the old value died at SelectItr.
	    if (It->definesRegister(X86::EFLAGS)) {
	      ReachedBlockEnd = false;
	      break;
	    }
	  }

	  // Nothing in the rest of the block touched EFLAGS: it is still live if any
	  // successor lists it as a live-in.
	  if (ReachedBlockEnd) {
	    for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
	                                          SE = BB->succ_end();
	         SI != SE; ++SI) {
	      if ((*SI)->isLiveIn(X86::EFLAGS))
	        return false;
	    }
	  }

	  // Either a def was found, or the block ended with EFLAGS not live out, so
	  // the select should carry a kill flag on EFLAGS.
	  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
	  return true;
	}

	// Tell whether MI is one of the CMOV pseudo-opcodes that may be cascaded
	// together with other CMOV pseudo-opcodes into a single basic-block with a
	// conditional jump around it.
	static bool isCMOVPseudo(MachineInstr &MI) {
	  bool IsPseudo = false;
	  switch (MI.getOpcode()) {
	  // Scalar register classes.
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  // Vector register classes.
	  case X86::CMOV_V2F64:
	  case X86::CMOV_V2I64:
	  case X86::CMOV_V4F32:
	  case X86::CMOV_V4F64:
	  case X86::CMOV_V4I64:
	  case X86::CMOV_V8F32:
	  case X86::CMOV_V8F64:
	  case X86::CMOV_V8I64:
	  case X86::CMOV_V16F32:
	  // Mask (i1 vector) register classes.
	  case X86::CMOV_V8I1:
	  case X86::CMOV_V16I1:
	  case X86::CMOV_V32I1:
	  case X86::CMOV_V64I1:
	    IsPseudo = true;
	    break;

	  default:
	    break;
	  }
	  return IsPseudo;
	}

	// Helper function, which inserts PHI functions into SinkMBB:
	//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
	// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
	// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
	// the last PHI function inserted.
	//
	// The condition code of the first CMOV in the range is the reference: a CMOV
	// carrying the opposite condition gets its two value operands swapped before
	// its PHI is built. All PHIs are inserted at the position SinkMBB->begin()
	// had on entry. The CMOVs themselves are not erased here.
	static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
	    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
	    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
	    MachineBasicBlock *SinkMBB) {
	  MachineFunction *MF = TrueMBB->getParent();
	  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
	  DebugLoc DL = MIItBegin->getDebugLoc();

	  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);

	  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

	  // As we are creating the PHIs, we have to be careful if there is more than
	  // one. Later CMOVs may reference the results of earlier CMOVs, but later
	  // PHIs have to reference the individual true/false inputs from earlier PHIs.
	  // That also means that PHI construction must work forward from earlier to
	  // later, and that the code must maintain a mapping from earlier PHI's
	  // destination registers, and the registers that went into the PHI.
	  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
	  MachineInstrBuilder MIB;

	  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
	    unsigned DestReg = MIIt->getOperand(0).getReg();
	    unsigned Op1Reg = MIIt->getOperand(1).getReg();
	    unsigned Op2Reg = MIIt->getOperand(2).getReg();

	    // If this CMOV we are generating is the opposite condition from
	    // the jump we generated, then we have to swap the operands for the
	    // PHI that is going to be generated.
	    if (MIIt->getOperand(3).getImm() == OppCC)
	      std::swap(Op1Reg, Op2Reg);

	    // An operand defined by an earlier PHI of this run is replaced by the
	    // value that PHI receives along the same edge (single lookup each).
	    auto Op1It = RegRewriteTable.find(Op1Reg);
	    if (Op1It != RegRewriteTable.end())
	      Op1Reg = Op1It->second.first;

	    auto Op2It = RegRewriteTable.find(Op2Reg);
	    if (Op2It != RegRewriteTable.end())
	      Op2Reg = Op2It->second.second;

	    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
	              .addReg(Op1Reg)
	              .addMBB(FalseMBB)
	              .addReg(Op2Reg)
	              .addMBB(TrueMBB);

	    // Add this PHI to the rewrite table.
	    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
	  }

	  return MIB;
	}

	// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
	//
	// Both CMOV pseudos are replaced by two conditional branches out of ThisMBB
	// and FirstInsertedMBB, a three-input PHI in a new SinkMBB, and a COPY of the
	// PHI result into the second CMOV's destination. Returns SinkMBB, which
	// receives everything that followed FirstCMOV in ThisMBB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
	MachineInstr &SecondCascadedCMOV,
	MachineBasicBlock *ThisMBB) const {
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	DebugLoc DL = FirstCMOV.getDebugLoc();

	// We lower cascaded CMOVs such as
	//
	// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
	//
	// to two successive branches.
	//
	// Without this, we would add a PHI between the two jumps, which ends up
	// creating a few copies all around. For instance, for
	//
	// (sitofp (zext (fcmp une)))
	//
	// we would generate:
	//
	// ucomiss %xmm1, %xmm0
	// movss <1.0f>, %xmm0
	// movaps %xmm0, %xmm1
	// jne .LBB5_2
	// xorps %xmm1, %xmm1
	// .LBB5_2:
	// jp .LBB5_4
	// movaps %xmm1, %xmm0
	// .LBB5_4:
	// retq
	//
	// because this custom-inserter would have generated:
	//
	//   A
	//   | \ .
	//   |  B
	//   | /
	//   C
	//   | \ .
	//   |  D
	//   | /
	//   E
	//
	// A: X = ...; Y = ...
	// B: empty
	// C: Z = PHI [X, A], [Y, B]
	// D: empty
	// E: PHI [X, C], [Z, D]
	//
	// If we lower both CMOVs in a single step, we can instead generate:
	//
	//   A
	//   | \ .
	//   |  C
	//   | /|
	//   |/ |
	//   |  |
	//   |  D
	//   | /
	//   E
	//
	// A: X = ...; Y = ...
	// D: empty
	// E: PHI [X, A], [X, C], [Y, D]
	//
	// Which, in our sitofp/fcmp example, gives us something like:
	//
	// ucomiss %xmm1, %xmm0
	// movss <1.0f>, %xmm0
	// jne .LBB5_4
	// jp .LBB5_4
	// xorps %xmm0, %xmm0
	// .LBB5_4:
	// retq
	//

	// We lower cascaded CMOV into two successive branches to the same block.
	// EFLAGS is used by both, so mark it as live in the second.
	const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	MachineFunction *F = ThisMBB->getParent();
	MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	// Layout order after ThisMBB: FirstInsertedMBB, SecondInsertedMBB, SinkMBB.
	MachineFunction::iterator It = ++ThisMBB->getIterator();
	F->insert(It, FirstInsertedMBB);
	F->insert(It, SecondInsertedMBB);
	F->insert(It, SinkMBB);

	// For a cascaded CMOV, we lower it to two successive branches to
	// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
	// the FirstInsertedMBB.
	FirstInsertedMBB->addLiveIn(X86::EFLAGS);

	// If the EFLAGS register isn't dead in the terminator, then claim that it's
	// live into the sink and copy blocks.
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
	!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
	SecondInsertedMBB->addLiveIn(X86::EFLAGS);
	SinkMBB->addLiveIn(X86::EFLAGS);
	}

	// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	std::next(MachineBasicBlock::iterator(FirstCMOV)),
	ThisMBB->end());
	SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	// Fallthrough block for ThisMBB.
	ThisMBB->addSuccessor(FirstInsertedMBB);
	// The true block target of the first branch is always SinkMBB.
	ThisMBB->addSuccessor(SinkMBB);
	// Fallthrough block for FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
	// The true block for the branch of FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SinkMBB);
	// This is fallthrough.
	SecondInsertedMBB->addSuccessor(SinkMBB);

	// Create the conditional branch instructions.
	// The first branch uses the first CMOV's condition, the second branch the
	// second CMOV's condition; both target SinkMBB.
	X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
	unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
	BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

	X86::CondCode SecondCC =
	X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
	unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
	BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);

	// SinkMBB:
	// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
	// (a third incoming value, for FirstInsertedMBB, is appended just below)
	unsigned DestReg = FirstCMOV.getOperand(0).getReg();
	unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
	unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
	MachineInstrBuilder MIB =
	BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
	.addReg(Op1Reg)
	.addMBB(SecondInsertedMBB)
	.addReg(Op2Reg)
	.addMBB(ThisMBB);

	// The edge from FirstInsertedMBB provides the same incoming value as the
	// edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
	MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
	// Copy the PHI result to the register defined by the second CMOV.
	BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
	TII->get(TargetOpcode::COPY),
	SecondCascadedCMOV.getOperand(0).getReg())
	.addReg(FirstCMOV.getOperand(0).getReg());

	// Now remove the CMOVs.
	FirstCMOV.eraseFromParent();
	SecondCascadedCMOV.eraseFromParent();

	return SinkMBB;
	}

	/// Lower the CMOV pseudo \p MI, and any run of compatible CMOV pseudos that
	/// immediately follows it in \p ThisMBB, into explicit control flow.
	///
	/// A run of CMOVs using the same condition as \p MI (or its exact opposite)
	/// is lowered with one conditional branch, one fall-through block and one PHI
	/// per CMOV. A single CMOV whose result feeds a second, cascaded CMOV is
	/// handed to EmitLoweredCascadedSelect instead.
	///
	/// \returns the block that now holds the code which followed the CMOV(s).
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
	                                     MachineBasicBlock *ThisMBB) const {
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // To "insert" a SELECT_CC instruction, we actually have to insert the
	  // diamond control-flow pattern. The incoming instruction knows the
	  // destination vreg to set, the condition code register to branch on, the
	  // true/false values to select between and a branch opcode to use.

	  //  ThisMBB:
	  //  ...
	  //   TrueVal = ...
	  //   cmpTY ccX, r1, r2
	  //   bCC SinkMBB
	  //   fallthrough --> FalseMBB

	  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
	  // as described above, by inserting a BB, and then making a PHI at the join
	  // point to select the true and false operands of the CMOV in the PHI.
	  //
	  // The code also handles two different cases of multiple CMOV opcodes
	  // in a row.
	  //
	  // Case 1:
	  // In this case, there are multiple CMOVs in a row, all which are based on
	  // the same condition setting (or the exact opposite condition setting).
	  // In this case we can lower all the CMOVs using a single inserted BB, and
	  // then make a number of PHIs at the join point to model the CMOVs. The only
	  // trickiness here, is that in a case like:
	  //
	  // t2 = CMOV cond1 t1, f1
	  // t3 = CMOV cond1 t2, f2
	  //
	  // when rewriting this into PHIs, we have to perform some renaming on the
	  // temps since you cannot have a PHI operand refer to a PHI result earlier
	  // in the same block. The "simple" but wrong lowering would be:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t2(BB1), f2(BB2)
	  //
	  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
	  // renaming is to note that on the path through BB1, t2 is really just a
	  // copy of t1, and do that renaming, properly generating:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t1(BB1), f2(BB2)
	  //
	  // Case 2:
	  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
	  // function - EmitLoweredCascadedSelect.

	  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
	  MachineInstr *LastCMOV = &MI;
	  MachineBasicBlock::iterator NextMIIt =
	      std::next(MachineBasicBlock::iterator(MI));

	  // Check for case 1, where there are multiple CMOVs with the same condition
	  // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
	  // number of jumps the most.

	  if (isCMOVPseudo(MI)) {
	    // See if we have a string of CMOVS with the same condition.
	    while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
	           (NextMIIt->getOperand(3).getImm() == CC ||
	            NextMIIt->getOperand(3).getImm() == OppCC)) {
	      LastCMOV = &*NextMIIt;
	      ++NextMIIt;
	    }
	  }

	  // This checks for case 2, but only do this if we didn't already find
	  // case 1, as indicated by LastCMOV == MI.
	  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
	      NextMIIt->getOpcode() == MI.getOpcode() &&
	      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
	      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
	      NextMIIt->getOperand(1).isKill()) {
	    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
	  }

	  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	  MachineFunction *F = ThisMBB->getParent();
	  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	  MachineFunction::iterator It = ++ThisMBB->getIterator();
	  F->insert(It, FalseMBB);
	  F->insert(It, SinkMBB);

	  // If the EFLAGS register isn't dead in the terminator, then claim that it's
	  // live into the sink and copy blocks.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
	      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
	    FalseMBB->addLiveIn(X86::EFLAGS);
	    SinkMBB->addLiveIn(X86::EFLAGS);
	  }

	  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	                  std::next(MachineBasicBlock::iterator(LastCMOV)),
	                  ThisMBB->end());
	  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	  // Fallthrough block for ThisMBB.
	  ThisMBB->addSuccessor(FalseMBB);
	  // The true block target of the first (or only) branch is always a SinkMBB.
	  ThisMBB->addSuccessor(SinkMBB);
	  // Fallthrough block for FalseMBB.
	  FalseMBB->addSuccessor(SinkMBB);

	  // Create the conditional branch instruction.
	  unsigned Opc = X86::GetCondBranchFromCond(CC);
	  BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

	  //  SinkMBB:
	  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
	  //  ...
	  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
	  MachineBasicBlock::iterator MIItEnd =
	      std::next(MachineBasicBlock::iterator(LastCMOV));
	  createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);

	  // Now remove the CMOV(s).
	  ThisMBB->erase(MIItBegin, MIItEnd);

	  return SinkMBB;
	}

	MachineBasicBlock *
	X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  // Expand a RELEASE_FADD pseudo, which stands for the pattern
	  //   a.store(reg OP a.load(acquire), release)
	  // into a scalar FP op with a memory source followed by a plain store:
	  //   OPss (%gpr), %xmm
	  //   movss %xmm, (%gpr)
	  // (or the sd forms for the 64-bit pseudo). Returns BB; MI is erased.
	  unsigned ArithOpc;
	  unsigned StoreOpc;
	  switch (MI.getOpcode()) {
	  case X86::RELEASE_FADD32mr:
	    ArithOpc = X86::ADDSSrm;
	    StoreOpc = X86::MOVSSmr;
	    break;
	  case X86::RELEASE_FADD64mr:
	    ArithOpc = X86::ADDSDrm;
	    StoreOpc = X86::MOVSDmr;
	    break;
	  default:
	    llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
	  }

	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

	  // The value operand sits directly after the address operands.
	  const unsigned SrcReg = MI.getOperand(X86::AddrNumOperands).getReg();
	  const unsigned ResultReg =
	      MRI.createVirtualRegister(MRI.getRegClass(SrcReg));

	  // ResultReg = SrcReg OP [address]
	  MachineInstrBuilder ArithMIB =
	      BuildMI(*BB, MI, DL, TII->get(ArithOpc), ResultReg).addReg(SrcReg);
	  for (int Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
	    MachineOperand &AddrOp = MI.getOperand(Idx);
	    // The same address operands are reused by the store below, so no
	    // register may be marked as killed by the arithmetic instruction.
	    if (AddrOp.isReg())
	      AddrOp.setIsKill(false);
	    ArithMIB.add(AddrOp);
	  }

	  // [address] = ResultReg
	  MachineInstrBuilder StoreMIB = BuildMI(*BB, MI, DL, TII->get(StoreOpc));
	  for (int Idx = 0; Idx != X86::AddrNumOperands; ++Idx)
	    StoreMIB.add(MI.getOperand(Idx));
	  StoreMIB.addReg(ResultReg, RegState::Kill);

	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
	                                        MachineBasicBlock *BB) const {
	  // Lower a dynamic alloca pseudo for a function that uses split stacks.
	  // Operand 0 of MI is the register receiving the allocated pointer and
	  // operand 1 is the register holding the requested size. BB is split so
	  // that the allocation is served either by bumping the stack pointer or by
	  // calling __morestack_allocate_stack_space. MI is erased and the block
	  // holding the instructions that followed it is returned.
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();

	  assert(MF->shouldSplitStack());

	  const bool Is64Bit = Subtarget.is64Bit();
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  // Segment register and displacement of the memory word that the
	  // prospective stack pointer is compared against below.
	  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
	  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

	  // BB:
	  //  ... [Till the alloca]
	  // If stacklet is not large enough, jump to mallocMBB
	  //
	  // bumpMBB:
	  //  Allocate by subtracting from RSP
	  //  Jump to continueMBB
	  //
	  // mallocMBB:
	  //  Allocate by call to runtime
	  //
	  // continueMBB:
	  //  ...
	  //  [rest of original BB]
	  //

	  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  const TargetRegisterClass *AddrRegClass =
	      getRegClassFor(getPointerTy(MF->getDataLayout()));

	  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
	           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
	           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
	           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
	           sizeVReg = MI.getOperand(1).getReg(),
	           physSPReg =
	               (IsLP64 || Subtarget.isTargetNaCl64()) ? X86::RSP : X86::ESP;

	  MachineFunction::iterator MBBIter = ++BB->getIterator();

	  MF->insert(MBBIter, bumpMBB);
	  MF->insert(MBBIter, mallocMBB);
	  MF->insert(MBBIter, continueMBB);

	  // Everything after the alloca, and BB's successor edges, move to
	  // continueMBB.
	  continueMBB->splice(continueMBB->begin(), BB,
	                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // Add code to the main basic block to check if the stack limit has been hit,
	  // and if so, jump to mallocMBB otherwise to bumpMBB.
	  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
	    .addReg(tmpSPVReg).addReg(sizeVReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
	    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
	    .addReg(SPLimitVReg);
	  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

	  // bumpMBB simply decreases the stack pointer, since we know the current
	  // stacklet has enough space.
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
	    .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
	    .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Calls into a routine in libgcc to allocate more space from the heap.
	  const uint32_t *RegMask =
	      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
	  if (IsLP64) {
	    // LP64: size is passed in RDI, result comes back in RAX.
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
	      .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	      .addExternalSymbol("__morestack_allocate_stack_space")
	      .addRegMask(RegMask)
	      .addReg(X86::RDI, RegState::Implicit)
	      .addReg(X86::RAX, RegState::ImplicitDefine);
	  } else if (Is64Bit) {
	    // 64-bit with 32-bit pointers: size in EDI, result in EAX.
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
	      .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	      .addExternalSymbol("__morestack_allocate_stack_space")
	      .addRegMask(RegMask)
	      .addReg(X86::EDI, RegState::Implicit)
	      .addReg(X86::EAX, RegState::ImplicitDefine);
	  } else {
	    // 32-bit: the size is pushed on the stack. 12 bytes are reserved first,
	    // so together with the 4-byte push the stack moves by the 16 bytes that
	    // are released again after the call.
	    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
	      .addImm(12);
	    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
	      .addExternalSymbol("__morestack_allocate_stack_space")
	      .addRegMask(RegMask)
	      .addReg(X86::EAX, RegState::ImplicitDefine);
	  }

	  if (!Is64Bit)
	    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
	      .addImm(16);

	  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
	    .addReg(IsLP64 ? X86::RAX : X86::EAX);
	  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Set up the CFG correctly.
	  BB->addSuccessor(bumpMBB);
	  BB->addSuccessor(mallocMBB);
	  mallocMBB->addSuccessor(continueMBB);
	  bumpMBB->addSuccessor(continueMBB);

	  // Take care of the PHI nodes: the result is whichever pointer the taken
	  // path produced.
	  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
	          MI.getOperand(0).getReg())
	      .addReg(mallocPtrVReg)
	      .addMBB(mallocMBB)
	      .addReg(bumpSPPtrVReg)
	      .addMBB(bumpMBB);

	  // Delete the original pseudo instruction.
	  MI.eraseFromParent();

	  // And we're done.
	  return continueMBB;
	}

	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  // Custom insertion for CATCHRET. On 32-bit targets the catchret is
	  // redirected through a freshly created block that runs EH_RESTORE and
	  // then jumps to the original destination; on other targets nothing
	  // changes. BB is returned in both cases and MI is kept.
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
	  // Remember the real destination before operand 0 is retargeted below.
	  MachineBasicBlock *ReturnDestMBB = MI.getOperand(0).getMBB();
	  DebugLoc DL = MI.getDebugLoc();

	  assert(!isAsynchronousEHPersonality(
	             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
	         "SEH does not use catchret!");

	  // Only 32-bit EH needs to worry about manually restoring stack pointers.
	  if (!Subtarget.is32Bit())
	    return BB;

	  // C++ EH creates a new target block to hold the restore code, and wires up
	  // the new block to the return destination with a normal JMP_4.
	  MachineBasicBlock *RestoreMBB =
	      MF->CreateMachineBasicBlock(BB->getBasicBlock());
	  assert(BB->succ_size() == 1);
	  MF->insert(std::next(BB->getIterator()), RestoreMBB);
	  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
	  BB->addSuccessor(RestoreMBB);
	  MI.getOperand(0).setMBB(RestoreMBB);

	  // RestoreMBB is still empty, so appending yields EH_RESTORE followed by
	  // the jump.
	  BuildMI(RestoreMBB, DL, TII.get(X86::EH_RESTORE));
	  BuildMI(RestoreMBB, DL, TII.get(X86::JMP_4)).addMBB(ReturnDestMBB);
	  return BB;
	}

	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  // Custom insertion for CATCHPAD: the pseudo is always removed, and for
	  // 32-bit SEH an EH_RESTORE is emitted in its place. Returns BB.
	  const Constant *Personality =
	      BB->getParent()->getFunction().getPersonalityFn();
	  const bool UsesSEH =
	      isAsynchronousEHPersonality(classifyEHPersonality(Personality));

	  // Only 32-bit SEH requires special handling for catchpad.
	  if (UsesSEH && Subtarget.is32Bit()) {
	    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
	    BuildMI(*BB, MI, MI.getDebugLoc(), TII.get(X86::EH_RESTORE));
	  }

	  MI.eraseFromParent();
	  return BB;
	}

	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  // So, here we replace TLSADDR with the sequence:
	  // adjust_stackdown -> TLSADDR -> adjust_stackup.
	  // We need this because TLSADDR is lowered into calls
	  // inside MC, therefore without the two markers shrink-wrapping
	  // may push the prologue/epilogue pass them.
	  // MI itself stays in place; BB is returned.
	  MachineFunction &MF = *BB->getParent();
	  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  const MachineBasicBlock::iterator TLSAddrPos(MI);

	  // Emit CALLSEQ_START right before the instruction.
	  MachineInstr *FrameSetup = BuildMI(MF, DL,
	                                     TII.get(TII.getCallFrameSetupOpcode()))
	                                 .addImm(0)
	                                 .addImm(0)
	                                 .addImm(0);
	  BB->insert(TLSAddrPos, FrameSetup);

	  // Emit CALLSEQ_END right after the instruction.
	  // We don't call erase from parent because we want to keep the
	  // original instruction around.
	  MachineInstr *FrameDestroy =
	      BuildMI(MF, DL, TII.get(TII.getCallFrameDestroyOpcode()))
	          .addImm(0)
	          .addImm(0);
	  BB->insertAfter(TLSAddrPos, FrameDestroy);

	  return BB;
	}

	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  // This is pretty easy. We're taking the value that we received from
	  // our load from the relocation, sticking it in either RDI (x86-64)
	  // or EAX and doing an indirect call. The return value will then
	  // be in the normal return register.
	  // Operand 3 of MI is the global being accessed. MI is erased; BB is
	  // returned.
	  MachineFunction *F = BB->getParent();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
	  assert(MI.getOperand(3).isGlobal() && "This should be a global");

	  const bool Is64Bit = Subtarget.is64Bit();
	  const MachineOperand &GlobalOp = MI.getOperand(3);

	  // Get a register mask for the lowered call.
	  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
	  // proper register mask.
	  const uint32_t *RegMask;
	  if (Is64Bit)
	    RegMask = Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask();
	  else
	    RegMask =
	        Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);

	  // The three variants (64-bit, 32-bit non-PIC, 32-bit PIC) emit the same
	  // load + indirect call and differ only in opcodes and registers.
	  unsigned LoadOpc;
	  unsigned CallOpc;
	  unsigned PtrReg;  // Receives the loaded pointer and addresses the call.
	  unsigned RetReg;  // Implicitly defined by the call.
	  unsigned BaseReg; // Base register of the load's address.
	  if (Is64Bit) {
	    LoadOpc = X86::MOV64rm;
	    CallOpc = X86::CALL64m;
	    PtrReg = X86::RDI;
	    RetReg = X86::RAX;
	    BaseReg = X86::RIP;
	  } else {
	    LoadOpc = X86::MOV32rm;
	    CallOpc = X86::CALL32m;
	    PtrReg = X86::EAX;
	    RetReg = X86::EAX;
	    // Position-independent code addresses the global off the global base
	    // register; otherwise no base register is used.
	    BaseReg = isPositionIndependent() ? TII->getGlobalBaseReg(F) : 0;
	  }

	  BuildMI(*BB, MI, DL, TII->get(LoadOpc), PtrReg)
	      .addReg(BaseReg)
	      .addImm(0)
	      .addReg(0)
	      .addGlobalAddress(GlobalOp.getGlobal(), 0, GlobalOp.getTargetFlags())
	      .addReg(0);

	  MachineInstrBuilder CallMIB = BuildMI(*BB, MI, DL, TII->get(CallOpc));
	  addDirectMem(CallMIB, PtrReg);
	  CallMIB.addReg(RetReg, RegState::ImplicitDefine).addRegMask(RegMask);

	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	+/// Map a RETPOLINE_* pseudo opcode to the direct call / tail-call opcode
	+/// that replaces it. Any other opcode is a programming error.
	+static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
	+  switch (RPOpc) {
	+  case X86::RETPOLINE_CALL32:
	+    return X86::CALLpcrel32;
	+  case X86::RETPOLINE_CALL64:
	+    return X86::CALL64pcrel32;
	+  case X86::RETPOLINE_TCRETURN32:
	+    return X86::TCRETURNdi;
	+  case X86::RETPOLINE_TCRETURN64:
	+    return X86::TCRETURNdi64;
	+  default:
	+    llvm_unreachable("not retpoline opcode");
	+  }
	+}
	+
	+/// Return the name of the retpoline thunk that expects the callee in \p Reg.
	+/// A \p Reg of 0 selects the thunk that takes the callee on the stack. The
	+/// "external" spelling is chosen when the subtarget asks for external
	+/// thunks.
	+static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
	+                                      unsigned Reg) {
	+  const bool UseExternalThunk = Subtarget.useRetpolineExternalThunk();
	+  switch (Reg) {
	+  case 0:
	+    assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
	+    return UseExternalThunk ? "__llvm_external_retpoline_push"
	+                            : "__llvm_retpoline_push";
	+  case X86::EAX:
	+    return UseExternalThunk ? "__llvm_external_retpoline_eax"
	+                            : "__llvm_retpoline_eax";
	+  case X86::ECX:
	+    return UseExternalThunk ? "__llvm_external_retpoline_ecx"
	+                            : "__llvm_retpoline_ecx";
	+  case X86::EDX:
	+    return UseExternalThunk ? "__llvm_external_retpoline_edx"
	+                            : "__llvm_retpoline_edx";
	+  case X86::R11:
	+    return UseExternalThunk ? "__llvm_external_retpoline_r11"
	+                            : "__llvm_retpoline_r11";
	+  default:
	+    llvm_unreachable("unexpected reg for retpoline");
	+  }
	+}
	+
	MachineBasicBlock *
	+X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
	+                                        MachineBasicBlock *BB) const {
	+  // Rewrite a RETPOLINE_* pseudo in place into a direct call (or tail call)
	+  // to a retpoline thunk. Operand 0 of MI is the virtual register holding
	+  // the real callee; it is handed to the thunk in a scratch physical
	+  // register or, when none is free on 32-bit, pushed on the stack.
	+  // Returns BB; MI is mutated, not erased.
	+  DebugLoc DL = MI.getDebugLoc();
	+  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	+  unsigned CalleeVReg = MI.getOperand(0).getReg();
	+  unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
	+
	+  // Find an available scratch register to hold the callee. On 64-bit, we can
	+  // just use R11, but we scan for uses anyway to ensure we don't generate
	+  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
	+  // already a register use operand to the call to hold the callee. If none
	+  // are available, push the callee instead. This is less efficient, but is
	+  // necessary for functions using 3 regparms. Such function calls are
	+  // (currently) not eligible for tail call optimization, because there is no
	+  // scratch register available to hold the address of the callee.
	+  SmallVector<unsigned, 3> AvailableRegs;
	+  if (Subtarget.is64Bit())
	+    AvailableRegs.push_back(X86::R11);
	+  else
	+    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
	+
	+  // Zero out any registers that are already used.
	+  for (const auto &MO : MI.operands()) {
	+    if (MO.isReg() && MO.isUse())
	+      for (unsigned &Reg : AvailableRegs)
	+        if (Reg == MO.getReg())
	+          Reg = 0;
	+  }
	+
	+  // Choose the first remaining non-zero available register.
	+  unsigned AvailableReg = 0;
	+  for (unsigned MaybeReg : AvailableRegs) {
	+    if (MaybeReg) {
	+      AvailableReg = MaybeReg;
	+      break;
	+    }
	+  }
	+
	+  const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
	+
	+  if (AvailableReg == 0) {
	+    // No register available. Use PUSH. This must not be a tailcall, and this
	+    // must not be x64.
	+    if (Subtarget.is64Bit())
	+      report_fatal_error(
	+          "Cannot make an indirect call on x86-64 using both retpoline and a "
	+          "calling convention that preservers r11");
	+    if (Opc != X86::CALLpcrel32)
	+      report_fatal_error("Cannot make an indirect tail call on x86 using "
	+                         "retpoline without a preserved register");
	+    BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
	+  } else {
	+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
	+        .addReg(CalleeVReg);
	+  }
	+
	+  // Turn the pseudo into a direct call to the thunk symbol.
	+  MI.getOperand(0).ChangeToES(Symbol);
	+  MI.setDesc(TII->get(Opc));
	+
	+  // When the callee travels in a register, record that the call reads (and
	+  // kills) it.
	+  if (AvailableReg != 0)
	+    MachineInstrBuilder(*BB->getParent(), &MI)
	+        .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
	+  return BB;
	+}
	+
	+MachineBasicBlock *
	X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
	                                    MachineBasicBlock *MBB) const {
	  // Expand the SjLj setjmp pseudo. Operand 0 of MI is the result register
	  // and the address operands of the jump buffer follow it. The block is
	  // split as sketched in the diagram below; MI is erased and the block
	  // holding the instructions that followed it (sinkMBB) is returned.
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  unsigned CurOp = 0;

	  unsigned DstReg = MI.getOperand(CurOp++).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
	  (void)TRI;
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

	  // Index of the first address operand of the jump buffer.
	  unsigned MemOpndSlot = CurOp;

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  // For v = setjmp(buf), we generate
	  //
	  // thisMBB:
	  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
	  //  SjLjSetup restoreMBB
	  //
	  // mainMBB:
	  //  v_main = 0
	  //
	  // sinkMBB:
	  //  v = phi(main, restore)
	  //
	  // restoreMBB:
	  //  if base pointer being used, load it from frame
	  //  v_restore = 1

	  MachineBasicBlock *thisMBB = MBB;
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, sinkMBB);
	  MF->push_back(restoreMBB);
	  restoreMBB->setHasAddressTaken();

	  MachineInstrBuilder MIB;

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // thisMBB:
	  unsigned PtrStoreOpc = 0;
	  unsigned LabelReg = 0;
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  // The label can be stored as an immediate only for the small code model
	  // without PIC; otherwise its address is first materialized in a register.
	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  // Prepare IP either in reg or imm.
	  if (!UseImmLabel) {
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
	    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
	    LabelReg = MRI.createVirtualRegister(PtrRC);
	    if (Subtarget.is64Bit()) {
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
	              .addReg(X86::RIP)
	              .addImm(0)
	              .addReg(0)
	              .addMBB(restoreMBB)
	              .addReg(0);
	    } else {
	      // getGlobalBaseReg is only reachable through the X86-specific
	      // instruction info, hence the downcast of the pointer.
	      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
	              .addReg(XII->getGlobalBaseReg(MF))
	              .addImm(0)
	              .addReg(0)
	              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
	              .addReg(0);
	    }
	  } else
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  // Store IP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    // Same address as the jump buffer, displaced by LabelOffset.
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(MemOpndSlot + i));
	  }
	  if (!UseImmLabel)
	    MIB.addReg(LabelReg);
	  else
	    MIB.addMBB(restoreMBB);
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Setup
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
	          .addMBB(restoreMBB);

	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  MIB.addRegMask(RegInfo->getNoPreservedMask());
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(restoreMBB);

	  // mainMBB:
	  //  EAX = 0
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
	  mainMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
	          TII->get(X86::PHI), DstReg)
	    .addReg(mainDstReg).addMBB(mainMBB)
	    .addReg(restoreDstReg).addMBB(restoreMBB);

	  // restoreMBB:
	  if (RegInfo->hasBasePointer(*MF)) {
	    const bool Uses64BitFramePtr =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);
	    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
	    unsigned BasePtr = RegInfo->getBaseRegister();
	    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
	    // Reload the base pointer from its slot relative to the frame pointer.
	    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
	                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
	      .setMIFlag(MachineInstr::FrameSetup);
	  }
	  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
	  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  restoreMBB->addSuccessor(sinkMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
	                                     MachineBasicBlock *MBB) const {
	  // Expand the SjLj longjmp pseudo. The address operands of MI describe the
	  // jump buffer, from which three pointer-sized words are reloaded: the
	  // frame pointer at offset 0, the resume address at LabelOffset and the
	  // stack pointer at SPOffset. Control then jumps indirectly to the resume
	  // address. MI is erased and MBB returned.
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  const TargetRegisterClass *RC =
	      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	  // Holds the resume address until the final indirect jump.
	  unsigned Tmp = MRI.createVirtualRegister(RC);
	  // Since FP is only updated here but NOT referenced, it's treated as GPR.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
	  unsigned SP = RegInfo->getStackRegister();

	  MachineInstrBuilder MIB;

	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  const int64_t SPOffset = 2 * PVT.getStoreSize();

	  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
	  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

	  // Reload FP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI.getOperand(i));
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Reload IP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(i));
	  }
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Reload SP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(i), SPOffset);
	    else
	      MIB.add(MI.getOperand(i));
	  }
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Jump
	  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

	  MI.eraseFromParent();
	  return MBB;
	}

	void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
	                                               MachineBasicBlock *MBB,
	                                               MachineBasicBlock *DispatchBB,
	                                               int FI) const {
	  // Emit, before MI in MBB, a store of DispatchBB's address into the frame
	  // object FI at a fixed offset (56 on 64-bit, 36 on 32-bit). The address is
	  // stored as an immediate when possible and through a register otherwise.
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

	  unsigned Op = 0; // Opcode of the store.
	  unsigned VR = 0; // Register holding the address when no immediate is used.

	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  if (UseImmLabel) {
	    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  } else {
	    const TargetRegisterClass *TRC =
	        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	    VR = MRI->createVirtualRegister(TRC);
	    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

	    // Materialize the dispatch block's address with an LEA.
	    if (Subtarget.is64Bit())
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
	          .addReg(X86::RIP)
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB)
	          .addReg(0);
	    else
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
	          .addReg(0) /* TII->getGlobalBaseReg(MF) */
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
	          .addReg(0);
	  }

	  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
	  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
	  if (UseImmLabel)
	    MIB.addMBB(DispatchBB);
	  else
	    MIB.addReg(VR);
	}

	MachineBasicBlock *
	X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
	                                         MachineBasicBlock *BB) const {
	  // Build the SjLj exception dispatch machinery for the function:
	  //  * collect every landing pad, ordered by call-site number;
	  //  * append a dispatch block that loads the call-site index from the
	  //    function context, traps when it is out of range and otherwise jumps
	  //    through a jump table to the matching landing pad;
	  //  * store the dispatch block's address into the function context from
	  //    BB (see SetupEntryBlockForSjLj);
	  //  * redirect the EH edge of every invoke block to the dispatch block.
	  // MI is erased and BB returned.
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = BB->getParent();
	  MachineFrameInfo &MFI = MF->getFrameInfo();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  int FI = MFI.getFunctionContextIndex();

	  // Get a mapping of the call site numbers to all of the landing pads they're
	  // associated with.
	  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
	  unsigned MaxCSNum = 0;
	  for (auto &MBB : *MF) {
	    if (!MBB.isEHPad())
	      continue;

	    // The landing pad's symbol comes from its first non-debug instruction,
	    // which must be an EH_LABEL.
	    MCSymbol *Sym = nullptr;
	    for (const auto &PadMI : MBB) {
	      if (PadMI.isDebugValue())
	        continue;

	      assert(PadMI.isEHLabel() && "expected EH_LABEL");
	      Sym = PadMI.getOperand(0).getMCSymbol();
	      break;
	    }

	    if (!MF->hasCallSiteLandingPad(Sym))
	      continue;

	    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
	      CallSiteNumToLPad[CSI].push_back(&MBB);
	      MaxCSNum = std::max(MaxCSNum, CSI);
	    }
	  }

	  // Get an ordered list of the machine basic blocks for the jump table.
	  std::vector<MachineBasicBlock *> LPadList;
	  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
	  LPadList.reserve(CallSiteNumToLPad.size());

	  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
	    for (auto &LP : CallSiteNumToLPad[CSI]) {
	      LPadList.push_back(LP);
	      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
	    }
	  }

	  assert(!LPadList.empty() &&
	         "No landing pad destinations for the dispatch jump table!");

	  // Create the MBBs for the dispatch code.

	  // Shove the dispatch's address into the return slot in the function context.
	  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
	  DispatchBB->setIsEHPad(true);

	  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
	  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
	  DispatchBB->addSuccessor(TrapBB);

	  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
	  DispatchBB->addSuccessor(DispContBB);

	  // Insert MBBs.
	  MF->push_back(DispatchBB);
	  MF->push_back(DispContBB);
	  MF->push_back(TrapBB);

	  // Insert code into the entry block that creates and registers the function
	  // context.
	  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

	  // Create the jump table and associated information
	  unsigned JTE = getJumpTableEncoding();
	  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
	  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

	  const X86RegisterInfo &RI = TII->getRegisterInfo();
	  // Add a register mask with no preserved registers.  This results in all
	  // registers being marked as clobbered.
	  if (RI.hasBasePointer(*MF)) {
	    const bool FPIs64Bit =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);

	    // The mask is attached to the instruction that reloads the base pointer
	    // from its slot relative to the frame pointer.
	    unsigned FP = RI.getFrameRegister(*MF);
	    unsigned BP = RI.getBaseRegister();
	    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
	                 X86FI->getRestoreBasePointerOffset())
	        .addRegMask(RI.getNoPreservedMask());
	  } else {
	    // No base pointer to reload: a NOOP carries the mask instead.
	    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
	        .addRegMask(RI.getNoPreservedMask());
	  }

	  // IReg is used as an index in a memory operand and therefore can't be SP
	  unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
	  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
	                    Subtarget.is64Bit() ? 8 : 4);
	  // An index at or beyond the table size goes to the trap block.
	  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
	      .addReg(IReg)
	      .addImm(LPadList.size());
	  BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);

	  if (Subtarget.is64Bit()) {
	    unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
	    unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);

	    // leaq .LJTI0_0(%rip), BReg
	    BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
	        .addReg(X86::RIP)
	        .addImm(1)
	        .addReg(0)
	        .addJumpTableIndex(MJTI)
	        .addReg(0);
	    // movzx IReg64, IReg
	    BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
	        .addImm(0)
	        .addReg(IReg)
	        .addImm(X86::sub_32bit);

	    switch (JTE) {
	    case MachineJumpTableInfo::EK_BlockAddress:
	      // jmpq *(BReg,IReg64,8)
	      BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
	          .addReg(BReg)
	          .addImm(8)
	          .addReg(IReg64)
	          .addImm(0)
	          .addReg(0);
	      break;
	    case MachineJumpTableInfo::EK_LabelDifference32: {
	      unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
	      unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
	      unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);

	      // movl (BReg,IReg64,4), OReg
	      BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
	          .addReg(BReg)
	          .addImm(4)
	          .addReg(IReg64)
	          .addImm(0)
	          .addReg(0);
	      // movsx OReg64, OReg
	      BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
	      // addq BReg, OReg64, TReg
	      BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
	          .addReg(OReg64)
	          .addReg(BReg);
	      // jmpq *TReg
	      BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
	      break;
	    }
	    default:
	      llvm_unreachable("Unexpected jump table encoding");
	    }
	  } else {
	    // jmpl *.LJTI0_0(,IReg,4)
	    BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
	        .addReg(0)
	        .addImm(4)
	        .addReg(IReg)
	        .addJumpTableIndex(MJTI)
	        .addReg(0);
	  }

	  // Add the jump table entries as successors to the MBB.
	  // A landing pad may appear in the table more than once but must be added
	  // as a successor only once.
	  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
	  for (auto &LP : LPadList)
	    if (SeenMBBs.insert(LP).second)
	      DispContBB->addSuccessor(LP);

	  // N.B. the order the invoke BBs are processed in doesn't matter here.
	  SmallVector<MachineBasicBlock *, 64> MBBLPads;
	  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
	  for (MachineBasicBlock *MBB : InvokeBBs) {
	    // Remove the landing pad successor from the invoke block and replace it
	    // with the new dispatch block.
	    // Keep a copy of Successors since it's modified inside the loop.
	    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
	                                                   MBB->succ_rend());
	    // FIXME: Avoid quadratic complexity.
	    for (auto MBBS : Successors) {
	      if (MBBS->isEHPad()) {
	        MBB->removeSuccessor(MBBS);
	        MBBLPads.push_back(MBBS);
	      }
	    }

	    MBB->addSuccessor(DispatchBB);

	    // Find the invoke call and mark all of the callee-saved registers as
	    // 'implicit defined' so that they're spilled.  This prevents code from
	    // moving instructions to before the EH block, where they will never be
	    // executed.
	    for (auto &II : reverse(*MBB)) {
	      if (!II.isCall())
	        continue;

	      // Registers that already appear as operands of the call are skipped.
	      DenseMap<unsigned, bool> DefRegs;
	      for (auto &MOp : II.operands())
	        if (MOp.isReg())
	          DefRegs[MOp.getReg()] = true;

	      MachineInstrBuilder MIB(*MF, &II);
	      // SavedRegs is a zero-terminated list.
	      for (unsigned Idx = 0; SavedRegs[Idx]; ++Idx) {
	        unsigned Reg = SavedRegs[Idx];
	        if (!DefRegs[Reg])
	          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
	      }

	      // Only the last call in the block is annotated.
	      break;
	    }
	  }

	  // Mark all former landing pads as non-landing pads.  The dispatch is the only
	  // landing pad now.
	  for (auto &LP : MBBLPads)
	    LP->setIsEHPad(false);

	  // The instruction is gone now.
	  MI.eraseFromParent();
	  return BB;
	}

	/// Expand a pseudo instruction that was flagged for custom insertion.
	///
	/// Dispatches on the opcode of \p MI. Most opcodes are forwarded to a dedicated
	/// lowering helper; a few (flag reads/writes, x87 FP-to-int stores, the
	/// LCMPXCHG8B address fix-up, base-pointer live-in bookkeeping) are expanded
	/// inline here, and some are intentionally left untouched.
	///
	/// \param MI the pseudo instruction to expand.
	/// \param BB the basic block that currently contains \p MI.
	/// \returns the block produced by the selected helper, or \p BB for the cases
	///          handled inline. Any opcode not listed hits llvm_unreachable.
	MachineBasicBlock *
	X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                               MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("Unexpected instr type to insert");
	  case X86::TAILJMPd64:
	  case X86::TAILJMPr64:
	  case X86::TAILJMPm64:
	  case X86::TAILJMPr64_REX:
	  case X86::TAILJMPm64_REX:
	    llvm_unreachable("TAILJMP64 would not be touched here.");
	  case X86::TCRETURNdi64:
	  case X86::TCRETURNri64:
	  case X86::TCRETURNmi64:
	    return BB;
	  case X86::TLS_addr32:
	  case X86::TLS_addr64:
	  case X86::TLS_base_addr32:
	  case X86::TLS_base_addr64:
	    return EmitLoweredTLSAddr(MI, BB);
	  case X86::RETPOLINE_CALL32:
	  case X86::RETPOLINE_CALL64:
	  case X86::RETPOLINE_TCRETURN32:
	  case X86::RETPOLINE_TCRETURN64:
	    return EmitLoweredRetpoline(MI, BB);
	  case X86::CATCHRET:
	    return EmitLoweredCatchRet(MI, BB);
	  case X86::CATCHPAD:
	    return EmitLoweredCatchPad(MI, BB);
	  case X86::SEG_ALLOCA_32:
	  case X86::SEG_ALLOCA_64:
	    return EmitLoweredSegAlloca(MI, BB);
	  case X86::TLSCall_32:
	  case X86::TLSCall_64:
	    return EmitLoweredTLSCall(MI, BB);
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  case X86::CMOV_FR128:
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  case X86::CMOV_V2F64:
	  case X86::CMOV_V2I64:
	  case X86::CMOV_V4F32:
	  case X86::CMOV_V4F64:
	  case X86::CMOV_V4I64:
	  case X86::CMOV_V16F32:
	  case X86::CMOV_V8F32:
	  case X86::CMOV_V8F64:
	  case X86::CMOV_V8I64:
	  case X86::CMOV_V8I1:
	  case X86::CMOV_V16I1:
	  case X86::CMOV_V32I1:
	  case X86::CMOV_V64I1:
	    return EmitLoweredSelect(MI, BB);

	  case X86::RDFLAGS32:
	  case X86::RDFLAGS64: {
	    // Reading the flags is expanded to a PUSHF followed by a POP into the
	    // destination register.
	    unsigned PushF =
	        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
	    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
	    // Keep a pointer to the new push so that its operand can be adjusted below.
	    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
	    // Permit reads of the FLAGS register without it being defined.
	    // This intrinsic exists to read external processor state in flags, such as
	    // the trap flag, interrupt flag, and direction flag, none of which are
	    // modeled by the backend.
	    Push->getOperand(2).setIsUndef();
	    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::WRFLAGS32:
	  case X86::WRFLAGS64: {
	    // Writing the flags is expanded to a PUSH of the source register followed
	    // by a POPF.
	    unsigned Push =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
	    unsigned PopF =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
	    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
	    BuildMI(*BB, MI, DL, TII->get(PopF));

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::RELEASE_FADD32mr:
	  case X86::RELEASE_FADD64mr:
	    return EmitLoweredAtomicFP(MI, BB);

	  case X86::FP32_TO_INT16_IN_MEM:
	  case X86::FP32_TO_INT32_IN_MEM:
	  case X86::FP32_TO_INT64_IN_MEM:
	  case X86::FP64_TO_INT16_IN_MEM:
	  case X86::FP64_TO_INT32_IN_MEM:
	  case X86::FP64_TO_INT64_IN_MEM:
	  case X86::FP80_TO_INT16_IN_MEM:
	  case X86::FP80_TO_INT32_IN_MEM:
	  case X86::FP80_TO_INT64_IN_MEM: {
	    // Change the floating point control register to use "round towards zero"
	    // mode when truncating to an integer value.
	    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

	    // Load the old value of the high byte of the control word...
	    unsigned OldCW =
	        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
	                      CWFrameIdx);

	    // Set the high part to be round to zero...
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
	        .addImm(0xC7F);

	    // Reload the modified control word now...
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), CWFrameIdx);

	    // Restore the memory image of control word to original value
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
	        .addReg(OldCW);

	    // Get the X86 opcode to use.
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("illegal opcode!");
	    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
	    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
	    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
	    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
	    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
	    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
	    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
	    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
	    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
	    }

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
	        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

	    // Reload the original control word now.
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), CWFrameIdx);

	    MI.eraseFromParent(); // The pseudo instruction is gone now.
	    return BB;
	  }
	  // String/text processing lowering.
	  case X86::PCMPISTRM128REG:
	  case X86::VPCMPISTRM128REG:
	  case X86::PCMPISTRM128MEM:
	  case X86::VPCMPISTRM128MEM:
	  case X86::PCMPESTRM128REG:
	  case X86::VPCMPESTRM128REG:
	  case X86::PCMPESTRM128MEM:
	  case X86::VPCMPESTRM128MEM:
	    assert(Subtarget.hasSSE42() &&
	           "Target must have SSE4.2 or AVX features enabled");
	    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());

	  // String/text processing lowering.
	  case X86::PCMPISTRIREG:
	  case X86::VPCMPISTRIREG:
	  case X86::PCMPISTRIMEM:
	  case X86::VPCMPISTRIMEM:
	  case X86::PCMPESTRIREG:
	  case X86::VPCMPESTRIREG:
	  case X86::PCMPESTRIMEM:
	  case X86::VPCMPESTRIMEM:
	    assert(Subtarget.hasSSE42() &&
	           "Target must have SSE4.2 or AVX features enabled");
	    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());

	  // Thread synchronization.
	  case X86::MONITOR:
	    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
	  case X86::MONITORX:
	    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);

	  // Cache line zero
	  case X86::CLZERO:
	    return emitClzero(&MI, BB, Subtarget);

	  // PKU feature
	  case X86::WRPKRU:
	    return emitWRPKRU(MI, BB, Subtarget);
	  case X86::RDPKRU:
	    return emitRDPKRU(MI, BB, Subtarget);
	  // xbegin
	  case X86::XBEGIN:
	    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

	  case X86::VASTART_SAVE_XMM_REGS:
	    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

	  case X86::VAARG_64:
	    return EmitVAARG64WithCustomInserter(MI, BB);

	  case X86::EH_SjLj_SetJmp32:
	  case X86::EH_SjLj_SetJmp64:
	    return emitEHSjLjSetJmp(MI, BB);

	  case X86::EH_SjLj_LongJmp32:
	  case X86::EH_SjLj_LongJmp64:
	    return emitEHSjLjLongJmp(MI, BB);

	  case X86::Int_eh_sjlj_setup_dispatch:
	    return EmitSjLjDispatchBlock(MI, BB);

	  case TargetOpcode::STATEPOINT:
	    // As an implementation detail, STATEPOINT shares the STACKMAP format at
	    // this point in the process.  We diverge later.
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::PATCHABLE_EVENT_CALL:
	    // Do nothing here, handle in xray instrumentation pass.
	    return BB;

	  case X86::LCMPXCHG8B: {
	    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
	    // requires a memory operand. If it happens that current architecture is
	    // i686 and for current function we need a base pointer
	    // - which is ESI for i686 - register allocator would not be able to
	    // allocate registers for an address in form of X(%reg, %reg, Y)
	    // - there never would be enough unreserved registers during regalloc
	    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
	    // We are giving a hand to register allocator by precomputing the address in
	    // a new vreg using LEA.

	    // If it is not i686 or there is no base pointer - nothing to do here.
	    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
	      return BB;

	    // Even though this code does not necessarily needs the base pointer to
	    // be ESI, we check for that. The reason: if this assert fails, there are
	    // some changes happened in the compiler base pointer handling, which most
	    // probably have to be addressed somehow here.
	    assert(TRI->getBaseRegister() == X86::ESI &&
	           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
	           "base pointer in mind");

	    MachineRegisterInfo &MRI = MF->getRegInfo();
	    MVT SPTy = getPointerTy(MF->getDataLayout());
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    // Regalloc does not need any help when the memory operand of CMPXCHG8B
	    // does not use index register.
	    if (AM.IndexReg == X86::NoRegister)
	      return BB;

	    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
	    // four operand definitions that are E[ABCD] registers. We skip them and
	    // then insert the LEA.
	    MachineBasicBlock::iterator MBBI(MI);
	    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
	           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
	      --MBBI;
	    // Insert the LEA into this block, in front of the instruction MBBI stopped
	    // at (block and insertion point are both passed by reference).
	    addFullAddress(
	        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

	    // Make the CMPXCHG8B address the precomputed register directly.
	    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

	    return BB;
	  }
	  case X86::LCMPXCHG16B:
	    return BB;
	  case X86::LCMPXCHG8B_SAVE_EBX:
	  case X86::LCMPXCHG16B_SAVE_RBX: {
	    // Make sure EBX/RBX is recorded as live-in to this block.
	    unsigned BasePtr =
	        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
	    if (!BB->isLiveIn(BasePtr))
	      BB->addLiveIn(BasePtr);
	    return BB;
	  }
	  }
	}

	//===----------------------------------------------------------------------===//
	// X86 Optimization Hooks
	//===----------------------------------------------------------------------===//

	void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const {
	unsigned BitWidth = Known.getBitWidth();
	unsigned Opc = Op.getOpcode();
	EVT VT = Op.getValueType();
	assert((Opc >= ISD::BUILTIN_OP_END \|\|
	Opc == ISD::INTRINSIC_WO_CHAIN \|\|
	Opc == ISD::INTRINSIC_W_CHAIN \|\|
	Opc == ISD::INTRINSIC_VOID) &&
	"Should use MaskedValueIsZero if you don't know whether Op"
	" is a target node!");

	Known.resetAll();
	switch (Opc) {
	default: break;
	case X86ISD::SETCC:
	Known.Zero.setBitsFrom(1);
	break;
	case X86ISD::MOVMSK: {
	unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
	Known.Zero.setBitsFrom(NumLoBits);
	break;
	}
	case X86ISD::PEXTRB:
	case X86ISD::PEXTRW: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
	Op.getConstantOperandVal(1));
	DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
	Known = Known.zextOrTrunc(BitWidth);
	Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
	break;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
	Known.setAllZero();
	break;
	}

	DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	unsigned ShAmt = ShiftImm->getZExtValue();
	if (Opc == X86ISD::VSHLI) {
	Known.Zero <<= ShAmt;
	Known.One <<= ShAmt;
	// Low bits are known zero.
	Known.Zero.setLowBits(ShAmt);
	} else {
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);
	// High bits are known zero.
	Known.Zero.setHighBits(ShAmt);
	}
	}
	break;
	}
	case X86ISD::VZEXT: {
	// TODO: Add DemandedElts support.
	SDValue N0 = Op.getOperand(0);
	unsigned NumElts = VT.getVectorNumElements();

	EVT SrcVT = N0.getValueType();
	unsigned InNumElts = SrcVT.getVectorNumElements();
	unsigned InBitWidth = SrcVT.getScalarSizeInBits();
	assert(InNumElts >= NumElts && "Illegal VZEXT input");

	Known = KnownBits(InBitWidth);
	APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
	DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
	Known = Known.zext(BitWidth);
	Known.Zero.setBitsFrom(InBitWidth);
	break;
	}
	case X86ISD::CMOV: {
	DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	KnownBits Known2;
	DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	}
	case X86ISD::UDIVREM8_ZEXT_HREG:
	// TODO: Support more than just the zero extended bits?
	if (Op.getResNo() != 1)
	break;
	// The remainder is zero extended.
	Known.Zero.setBitsFrom(8);
	break;
	}
	}

	/// Return a lower bound on the number of sign bits of an X86 target node.
	///
	/// \param Op           the node being analysed.
	/// \param DemandedElts forwarded to the recursive query for vector shifts.
	/// \param DAG          used for the recursive sign-bit queries.
	/// \param Depth        current recursion depth; recursive queries use Depth + 1.
	/// \returns the computed count for handled opcodes, otherwise 1.
	unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  unsigned EltBits = Op.getScalarValueSizeInBits();

	  switch (Op.getOpcode()) {
	  case X86ISD::SETCC_CARRY:
	    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
	    return EltBits;

	  case X86ISD::VSEXT: {
	    // TODO: Add DemandedElts support.
	    SDValue In = Op.getOperand(0);
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, Depth + 1);
	    // Every bit added by the extension is a further sign bit.
	    return InSignBits + (EltBits - In.getScalarValueSizeInBits());
	  }

	  case X86ISD::VTRUNC: {
	    // TODO: Add DemandedElts support.
	    SDValue In = Op.getOperand(0);
	    unsigned InBits = In.getScalarValueSizeInBits();
	    assert(EltBits < InBits && "Illegal truncation input type");
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, Depth + 1);
	    unsigned Dropped = InBits - EltBits;
	    return InSignBits > Dropped ? InSignBits - Dropped : 1;
	  }

	  case X86ISD::PACKSS: {
	    // PACKSS is just a truncation if the sign bits extend to the packed size.
	    // TODO: Add DemandedElts support.
	    unsigned InBits = Op.getOperand(0).getScalarValueSizeInBits();
	    unsigned SignBits0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	    unsigned SignBits1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	    unsigned Common = std::min(SignBits0, SignBits1);
	    unsigned Dropped = InBits - EltBits;
	    return Common > Dropped ? Common - Dropped : 1;
	  }

	  case X86ISD::VSHLI: {
	    SDValue In = Op.getOperand(0);
	    APInt Amount = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
	    if (Amount.uge(EltBits))
	      return EltBits; // Shifted all bits out --> zero.
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, DemandedElts, Depth + 1);
	    if (Amount.uge(InSignBits))
	      return 1; // Shifted all sign bits out --> unknown.
	    return InSignBits - Amount.getZExtValue();
	  }

	  case X86ISD::VSRAI: {
	    SDValue In = Op.getOperand(0);
	    APInt Amount = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
	    if (Amount.uge(EltBits - 1))
	      return EltBits; // Sign splat.
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, DemandedElts, Depth + 1);
	    // The shift adds its amount to the existing sign bits, capped at the
	    // element width.
	    Amount += InSignBits;
	    if (Amount.uge(EltBits))
	      return EltBits;
	    return Amount.getZExtValue();
	  }

	  case X86ISD::PCMPGT:
	  case X86ISD::PCMPEQ:
	  case X86ISD::CMPP:
	  case X86ISD::VPCOM:
	  case X86ISD::VPCOMU:
	    // Vector compares return zero/all-bits result values.
	    return EltBits;

	  case X86ISD::CMOV: {
	    unsigned SignBits0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	    // Early out: the minimum cannot get any lower than 1.
	    if (SignBits0 == 1)
	      return 1;
	    unsigned SignBits1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	    return std::min(SignBits0, SignBits1);
	  }

	  case X86ISD::SDIVREM8_SEXT_HREG:
	    // TODO: Support more than just the sign extended bits?
	    // Result 1 (the remainder) is sign extended.
	    if (Op.getResNo() == 1)
	      return EltBits - 7;
	    break;
	  }

	  // Fallback case.
	  return 1;
	}

	/// Strip an X86ISD::Wrapper / X86ISD::WrapperRIP node: returns its first
	/// operand, or \p N itself when \p N is neither of those.
	SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
	  switch (N->getOpcode()) {
	  case X86ISD::Wrapper:
	  case X86ISD::WrapperRIP:
	    return N->getOperand(0);
	  default:
	    return N;
	  }
	}

	/// Returns true (and the GlobalValue and the offset) if the node is a
	/// GlobalAddress + offset.
	///
	/// An X86ISD::Wrapper around a global address node is handled here; every
	/// other node is deferred to TargetLowering::isGAPlusOffset.
	bool X86TargetLowering::isGAPlusOffset(SDNode *N,
	                                       const GlobalValue* &GA,
	                                       int64_t &Offset) const {
	  if (N->getOpcode() == X86ISD::Wrapper)
	    if (auto *Global = dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) {
	      GA = Global->getGlobal();
	      Offset = Global->getOffset();
	      return true;
	    }

	  return TargetLowering::isGAPlusOffset(N, GA, Offset);
	}

	// Attempt to match a combined shuffle mask against supported unary shuffle
	// instructions.
	// On a match, Shuffle/SrcVT/DstVT describe the instruction to use (V1 may be
	// replaced by a subvector of itself) and true is returned; otherwise false.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                    bool AllowFloatDomain, bool AllowIntDomain,
	                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
	  unsigned NumElts = Mask.size();
	  unsigned EltBits = MaskVT.getScalarSizeInBits();

	  // Record a match whose source and destination types are identical.
	  auto Select = [&](unsigned Opcode, MVT VT) {
	    Shuffle = Opcode;
	    SrcVT = DstVT = VT;
	    return true;
	  };

	  // True if every destination element takes source element i in its low part
	  // and undef/zero in the remaining Scale - 1 parts.
	  auto IsZeroExtendMask = [&](unsigned Scale, unsigned NumDstElts) {
	    for (unsigned Elt = 0; Elt != NumDstElts; ++Elt) {
	      bool LowOK = isUndefOrEqual(Mask[Elt * Scale], (int)Elt);
	      bool HighOK = isUndefOrZeroInRange(Mask, (Elt * Scale) + 1, Scale - 1);
	      if (!LowOK || !HighOK)
	        return false;
	    }
	    return true;
	  };

	  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
	  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
	  bool CanZeroExtend =
	      AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasInt256()));
	  if (CanZeroExtend) {
	    unsigned MaxScale = 64 / EltBits;
	    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
	      unsigned NumDstElts = NumElts / Scale;
	      if (!IsZeroExtendMask(Scale, NumDstElts))
	        continue;

	      unsigned SrcSize = std::max(128u, NumDstElts * EltBits);
	      MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
	                                          MVT::getIntegerVT(EltBits);
	      SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / EltBits);

	      if (SrcVT.getSizeInBits() == MaskVT.getSizeInBits()) {
	        Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
	      } else {
	        // The source is narrower than the mask type: extend from a subvector.
	        V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
	        Shuffle = unsigned(X86ISD::VZEXT);
	      }

	      MVT DstEltVT = MVT::getIntegerVT(Scale * EltBits);
	      DstVT = MVT::getVectorVT(DstEltVT, NumDstElts);
	      return true;
	    }
	  }

	  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
	  if (((EltBits == 32) || (EltBits == 64 && Subtarget.hasSSE2())) &&
	      isUndefOrEqual(Mask[0], 0) &&
	      isUndefOrZeroInRange(Mask, 1, NumElts - 1))
	    return Select(X86ISD::VZEXT_MOVL,
	                  !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT);

	  // Check if we have SSE3 which will let us use MOVDDUP etc. The
	  // instructions are no slower than UNPCKLPD but has the option to
	  // fold the input operand into even an unaligned memory load.
	  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
	    if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0}))
	      return Select(X86ISD::MOVDDUP, MVT::v2f64);
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}))
	      return Select(X86ISD::MOVSLDUP, MVT::v4f32);
	    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3}))
	      return Select(X86ISD::MOVSHDUP, MVT::v4f32);
	  }

	  if (MaskVT.is256BitVector() && AllowFloatDomain) {
	    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}))
	      return Select(X86ISD::MOVDDUP, MVT::v4f64);
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	      return Select(X86ISD::MOVSLDUP, MVT::v8f32);
	    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
	      return Select(X86ISD::MOVSHDUP, MVT::v8f32);
	  }

	  if (MaskVT.is512BitVector() && AllowFloatDomain) {
	    assert(Subtarget.hasAVX512() &&
	           "AVX512 required for 512-bit vector shuffles");
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	      return Select(X86ISD::MOVDDUP, MVT::v8f64);
	    if (isTargetShuffleEquivalent(
	            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}))
	      return Select(X86ISD::MOVSLDUP, MVT::v16f32);
	    if (isTargetShuffleEquivalent(
	            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}))
	      return Select(X86ISD::MOVSHDUP, MVT::v16f32);
	  }

	  // Attempt to match against broadcast-from-vector.
	  if (Subtarget.hasAVX2()) {
	    SmallVector<int, 64> SplatOfElt0(NumElts, 0);
	    if (isTargetShuffleEquivalent(Mask, SplatOfElt0))
	      return Select(X86ISD::VBROADCAST, MaskVT);
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported unary immediate
	// permute instructions.
	//
	// On a match, Shuffle receives the opcode, ShuffleVT the type to perform the
	// shuffle in and PermuteImm the immediate operand, and true is returned.
	// Returns false when no pattern matches.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           bool AllowFloatDomain,
	                                           bool AllowIntDomain,
	                                           const X86Subtarget &Subtarget,
	                                           unsigned &Shuffle, MVT &ShuffleVT,
	                                           unsigned &PermuteImm) {
	  unsigned NumMaskElts = Mask.size();
	  unsigned InputSizeInBits = MaskVT.getSizeInBits();
	  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
	  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

	  // The permutes matched below (everything except the final shift match)
	  // cannot produce zero elements.
	  bool ContainsZeros =
	      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
	  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
	    // Check for lane crossing permutes.
	    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
	      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
	      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
	        Shuffle = X86ISD::VPERMI;
	        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
	        PermuteImm = getV4X86ShuffleImm(Mask);
	        return true;
	      }
	      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
	        SmallVector<int, 4> RepeatedMask;
	        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
	          Shuffle = X86ISD::VPERMI;
	          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
	          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
	          return true;
	        }
	      }
	    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
	      // VPERMILPD can permute with a non-repeating shuffle.
	      Shuffle = X86ISD::VPERMILPI;
	      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
	      PermuteImm = 0;
	      // One immediate bit per element: the low bit of its mask index.
	      for (int i = 0, e = Mask.size(); i != e; ++i) {
	        int M = Mask[i];
	        if (M == SM_SentinelUndef)
	          continue;
	        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
	        PermuteImm |= (M & 1) << i;
	      }
	      return true;
	    }
	  }

	  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
	  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
	  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
	  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
	      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      // Narrow the repeated mask to create 32-bit element permutes.
	      SmallVector<int, 4> WordMask = RepeatedMask;
	      if (MaskScalarSizeInBits == 64)
	        scaleShuffleMask<int>(2, RepeatedMask, WordMask);

	      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
	      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
	      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
	      PermuteImm = getV4X86ShuffleImm(WordMask);
	      return true;
	    }
	  }

	  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
	  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
	      // Use the per-lane repeated mask rather than the first eight entries of
	      // Mask: for vectors wider than 128 bits an element that is undef in the
	      // first lane may still be defined by a later lane, and the immediate is
	      // applied to every lane.
	      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
	      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

	      // PSHUFLW: permute lower 4 elements only.
	      if (isUndefOrInRange(LoMask, 0, 4) &&
	          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	        Shuffle = X86ISD::PSHUFLW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(LoMask);
	        return true;
	      }

	      // PSHUFHW: permute upper 4 elements only.
	      if (isUndefOrInRange(HiMask, 4, 8) &&
	          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	        // Offset the HiMask so that we can create the shuffle immediate.
	        int OffsetHiMask[4];
	        for (int i = 0; i != 4; ++i)
	          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

	        Shuffle = X86ISD::PSHUFHW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
	        return true;
	      }
	    }
	  }

	  // Attempt to match against byte/bit shifts.
	  // FIXME: Add 512-bit support.
	  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
	    // The matcher fills in ShuffleVT and Shuffle; a positive result is the
	    // shift amount to use as the immediate.
	    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
	                                             MaskScalarSizeInBits, Mask,
	                                             0, Zeroable, Subtarget);
	    if (0 < ShiftAmt) {
	      PermuteImm = (unsigned)ShiftAmt;
	      return true;
	    }
	  }

	  return false;
	}

	// Attempt to match a combined unary shuffle mask against supported binary
	// shuffle instructions.
	// On a match, Shuffle/SrcVT/DstVT describe the instruction to use, V1/V2 may
	// have been rewritten to the operands it needs, and true is returned.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                     bool AllowFloatDomain, bool AllowIntDomain,
	                                     SDValue &V1, SDValue &V2, SDLoc &DL,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget,
	                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
	                                     bool IsUnary) {
	  unsigned EltBits = MaskVT.getScalarSizeInBits();

	  // Record a match whose source and destination types are identical.
	  auto Select = [&](unsigned Opcode, MVT VT) {
	    Shuffle = Opcode;
	    SrcVT = DstVT = VT;
	    return true;
	  };

	  if (MaskVT.is128BitVector()) {
	    // Both 64-bit halves come from the low half of V1.
	    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
	      V2 = V1;
	      return Select(X86ISD::MOVLHPS, MVT::v4f32);
	    }
	    // Both 64-bit halves come from the high half of V1.
	    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
	      V2 = V1;
	      return Select(X86ISD::MOVHLPS, MVT::v4f32);
	    }
	    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
	        (AllowFloatDomain || !Subtarget.hasSSE41())) {
	      std::swap(V1, V2);
	      return Select(X86ISD::MOVSD, MaskVT);
	    }
	    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
	        (AllowFloatDomain || !Subtarget.hasSSE41()))
	      return Select(X86ISD::MOVSS, MaskVT);
	  }

	  // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
	  // TODO add support for 256/512-bit types.
	  if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2() &&
	      matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
	                                 Subtarget)) {
	    DstVT = MaskVT;
	    return true;
	  }

	  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
	  bool CanUnpack =
	      (MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
	      (MaskVT.is256BitVector() && 32 <= EltBits && Subtarget.hasAVX()) ||
	      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
	      (MaskVT.is512BitVector() && Subtarget.hasAVX512());
	  if (CanUnpack &&
	      matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
	                                  DAG, Subtarget)) {
	    // Without AVX2, 256-bit unpacks are performed in the floating point type
	    // of matching element width.
	    MVT UnpackVT = MaskVT;
	    if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
	      UnpackVT = (32 == EltBits ? MVT::v8f32 : MVT::v4f64);
	    SrcVT = DstVT = UnpackVT;
	    return true;
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported binary immediate
	// permute instructions. The candidates are tried in this order: PALIGNR,
	// BLENDI, INSERTPS, SHUFPD, SHUFPS.
	// On a match, Shuffle receives the opcode, ShuffleVT the type to perform the
	// shuffle in and PermuteImm the immediate operand, and true is returned. V1/V2
	// are passed by reference and may be replaced (for example by a zero vector).
	// Returns false when no pattern matches.
	static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	const APInt &Zeroable,
	bool AllowFloatDomain,
	bool AllowIntDomain,
	SDValue &V1, SDValue &V2, SDLoc &DL,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	unsigned &Shuffle, MVT &ShuffleVT,
	unsigned &PermuteImm) {
	unsigned NumMaskElts = Mask.size();
	unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

	// Attempt to match against PALIGNR byte rotate.
	if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
	int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
	// Only a strictly positive rotation amount counts as a match.
	if (0 < ByteRotation) {
	Shuffle = X86ISD::PALIGNR;
	ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
	PermuteImm = ByteRotation;
	return true;
	}
	}

	// Attempt to combine to X86ISD::BLENDI.
	if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) \|\|
	(Subtarget.hasAVX() && MaskVT.is256BitVector()))) \|\|
	(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
	uint64_t BlendMask = 0;
	bool ForceV1Zero = false, ForceV2Zero = false;
	// Work on a private copy of the mask that the blend matcher is handed.
	SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
	if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
	BlendMask)) {
	if (MaskVT == MVT::v16i16) {
	// We can only use v16i16 PBLENDW if the lanes are repeated.
	SmallVector<int, 8> RepeatedMask;
	if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
	RepeatedMask)) {
	assert(RepeatedMask.size() == 8 &&
	"Repeated mask size doesn't match!");
	// Set bit i of the immediate for every repeated-mask element whose index
	// is >= 8.
	PermuteImm = 0;
	for (int i = 0; i < 8; ++i)
	if (RepeatedMask[i] >= 8)
	PermuteImm \|= 1 << i;
	// Substitute a zero vector for an input the matcher asked to be zero.
	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	Shuffle = X86ISD::BLENDI;
	ShuffleVT = MaskVT;
	return true;
	}
	} else {
	// Determine a type compatible with X86ISD::BLENDI.
	ShuffleVT = MaskVT;
	if (Subtarget.hasAVX2()) {
	if (ShuffleVT == MVT::v4i64)
	ShuffleVT = MVT::v8i32;
	else if (ShuffleVT == MVT::v2i64)
	ShuffleVT = MVT::v4i32;
	} else {
	if (ShuffleVT == MVT::v2i64 \|\| ShuffleVT == MVT::v4i32)
	ShuffleVT = MVT::v8i16;
	else if (ShuffleVT == MVT::v4i64)
	ShuffleVT = MVT::v4f64;
	else if (ShuffleVT == MVT::v8i32)
	ShuffleVT = MVT::v8f32;
	}

	// For integer blend types, rescale the blend mask from the mask element
	// size to the element size of the chosen blend type.
	if (!ShuffleVT.isFloatingPoint()) {
	int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
	BlendMask =
	scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
	ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
	ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
	}

	// Substitute a zero vector for an input the matcher asked to be zero.
	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	PermuteImm = (unsigned)BlendMask;
	Shuffle = X86ISD::BLENDI;
	return true;
	}
	}
	}

	// Attempt to combine to INSERTPS.
	if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
	MaskVT.is128BitVector()) {
	// Only tried when at least one bit of Zeroable is set.
	if (Zeroable.getBoolValue() &&
	matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
	Shuffle = X86ISD::INSERTPS;
	ShuffleVT = MVT::v4f32;
	return true;
	}
	}

	// Attempt to combine to SHUFPD.
	if (AllowFloatDomain && EltSizeInBits == 64 &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE2()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
	Shuffle = X86ISD::SHUFP;
	ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
	return true;
	}
	}

	// Attempt to combine to SHUFPS.
	if (AllowFloatDomain && EltSizeInBits == 32 &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE1()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	SmallVector<int, 4> RepeatedMask;
	if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
	// Match each half of the repeated mask, to determine if its just
	// referencing one of the vectors, is zeroable or entirely undef.
	// Returns the vector that half should read from (an empty SDValue if the
	// half cannot be matched) and writes the two selector indices for that half
	// to S0/S1; -1 is written for an undef element. In the all-undef case S0/S1
	// are left untouched.
	auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
	int M0 = RepeatedMask[Offset];
	int M1 = RepeatedMask[Offset + 1];

	if (isUndefInRange(RepeatedMask, Offset, 2)) {
	return DAG.getUNDEF(MaskVT);
	} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : 0);
	S1 = (SM_SentinelUndef == M1 ? -1 : 1);
	return getZeroVector(MaskVT, Subtarget, DAG, DL);
	} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
	S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
	return V1;
	} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
	S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
	S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
	return V2;
	}

	return SDValue();
	};

	// Selectors default to undef (-1); elements 0-1 are filled from the low
	// half of the repeated mask and elements 2-3 from the high half.
	int ShufMask[4] = {-1, -1, -1, -1};
	SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
	SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

	// Both halves must have matched before the inputs are rewritten.
	if (Lo && Hi) {
	V1 = Lo;
	V2 = Hi;
	Shuffle = X86ISD::SHUFP;
	ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
	PermuteImm = getV4X86ShuffleImm(ShufMask);
	return true;
	}
	}
	}

	return false;
	}

	/// \brief Combine an arbitrary chain of shuffles into a single instruction if
	/// possible.
	///
	/// This is the leaf of the recursive combine below. When we have found some
	/// chain of single-use x86 shuffle instructions and accumulated the combined
	/// shuffle mask represented by them, this will try to pattern match that mask
	/// into either a single instruction if there is a special purpose instruction
	/// for this operation, or into a PSHUFB instruction which is a fully general
	/// instruction but should only be used to replace chains over a certain depth.
	///
	/// \param Inputs One or two source values feeding the chain (asserted). They
	/// must have the same total bit width as \p Root (asserted).
	/// \param Root The value at the end of the chain; every result returned here
	/// is bitcast to its type.
	/// \param BaseMask The accumulated, non-empty shuffle mask. Negative entries
	/// are the undef/zero sentinels.
	/// \param Depth Length of the chain being replaced. A depth of 1 is a single
	/// instruction chain: it is only re-encoded by the immediate/fixed pattern
	/// matchers and never turned into a variable mask shuffle.
	/// \param HasVariableMask When set, variable mask shuffles are permitted
	/// regardless of the depth threshold.
	/// \param DAG The DAG in which replacement nodes are created.
	/// \param DCI Combiner state; every newly created node is added to its
	/// worklist.
	/// \param Subtarget Feature queries gating each candidate instruction.
	/// \returns The replacement value bitcast to the root type, or an empty
	/// SDValue if nothing matched or the root is already the matched instruction.
	static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
	ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
	assert((Inputs.size() == 1 \|\| Inputs.size() == 2) &&
	"Unexpected number of shuffle inputs!");

	// Find the inputs that enter the chain. Note that multiple uses are OK
	// here, we're not going to remove the operands we find.
	// For a unary shuffle the second operand is an undef of V1's type.
	bool UnaryShuffle = (Inputs.size() == 1);
	SDValue V1 = peekThroughBitcasts(Inputs[0]);
	SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
	: peekThroughBitcasts(Inputs[1]));

	MVT VT1 = V1.getSimpleValueType();
	MVT VT2 = V2.getSimpleValueType();
	MVT RootVT = Root.getSimpleValueType();
	assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
	VT2.getSizeInBits() == RootVT.getSizeInBits() &&
	"Vector size mismatch");

	SDLoc DL(Root);
	SDValue Res;

	// A single-element mask can only be the identity (asserted below), so the
	// result is just V1 reinterpreted as the root type.
	unsigned NumBaseMaskElts = BaseMask.size();
	if (NumBaseMaskElts == 1) {
	assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
	return DAG.getBitcast(RootVT, V1);
	}

	unsigned RootSizeInBits = RootVT.getSizeInBits();
	unsigned NumRootElts = RootVT.getVectorNumElements();
	unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
	// Use the float domain if either input is floating point, or if the root is
	// 256 bits wide and AVX2 is not available.
	bool FloatDomain = VT1.isFloatingPoint() \|\| VT2.isFloatingPoint() \|\|
	(RootVT.is256BitVector() && !Subtarget.hasAVX2());

	// Don't combine if we are an AVX512/EVEX target and the mask element size
	// is different from the root element size - this would prevent writemasks
	// from being reused.
	// TODO - this currently prevents all lane shuffles from occurring.
	// TODO - check for writemasks usage instead of always preventing combining.
	// TODO - attempt to narrow Mask back to writemask size.
	bool IsEVEXShuffle =
	RootSizeInBits == 512 \|\| (Subtarget.hasVLX() && RootSizeInBits >= 128);

	// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

	// Handle 128-bit lane shuffles of 256-bit vectors.
	// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
	// we need to use the zeroing feature.
	// TODO - this should support binary shuffles.
	if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
	!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
	!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
	return SDValue(); // Nothing to do!
	MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
	// Build the immediate one nibble per 128-bit half: a negative (sentinel)
	// mask entry sets 0x8 in its nibble, otherwise the low bit of the entry
	// selects the source half.
	unsigned PermMask = 0;
	PermMask \|= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
	PermMask \|= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

	Res = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
	DAG.getUNDEF(ShuffleVT),
	DAG.getConstant(PermMask, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// For masks that have been widened to 128-bit elements or more,
	// narrow back down to 64-bit elements.
	SmallVector<int, 64> Mask;
	if (BaseMaskEltSizeInBits > 64) {
	assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
	int MaskScale = BaseMaskEltSizeInBits / 64;
	scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
	} else {
	Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
	}

	unsigned NumMaskElts = Mask.size();
	unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

	// Determine the effective mask value type.
	// Mask elements narrower than 32 bits always use an integer type.
	FloatDomain &= (32 <= MaskEltSizeInBits);
	MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
	: MVT::getIntegerVT(MaskEltSizeInBits);
	MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

	// Only allow legal mask types.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
	return SDValue();

	// Attempt to match the mask against known shuffle patterns.
	// These are outputs filled in by the match helpers below.
	MVT ShuffleSrcVT, ShuffleVT;
	unsigned Shuffle, PermuteImm;

	// Which shuffle domains are permitted?
	// Permit domain crossing at higher combine depths.
	bool AllowFloatDomain = FloatDomain \|\| (Depth > 3);
	bool AllowIntDomain = (!FloatDomain \|\| (Depth > 3)) && Subtarget.hasSSE2() &&
	(!MaskVT.is256BitVector() \|\| Subtarget.hasAVX2());

	// Determine zeroable mask elements.
	APInt Zeroable(NumMaskElts, 0);
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (isUndefOrZero(Mask[i]))
	Zeroable.setBit(i);

	if (UnaryShuffle) {
	// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
	// directly if we don't shuffle the lower element and we shuffle the upper
	// (zero) elements within themselves.
	if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
	(V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
	// Scale is the number of mask elements covering one scalar of the load.
	unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
	ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
	if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
	isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
	return DAG.getBitcast(RootVT, V1);
	}
	}

	// Each match below is rejected for EVEX shuffles unless the matched
	// shuffle type has the same number of elements as the root type.
	SDValue NewV1 = V1; // Save operand in case early exit happens.
	if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
	NewV1, DL, DAG, Subtarget, Shuffle,
	ShuffleSrcVT, ShuffleVT) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	AllowIntDomain, Subtarget, Shuffle,
	ShuffleVT, PermuteImm) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// The binary matchers are also tried for unary shuffles; UnaryShuffle is
	// passed through to matchBinaryVectorShuffle.
	SDValue NewV1 = V1; // Save operands in case early exit happens.
	SDValue NewV2 = V2;
	if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
	NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
	ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
	DCI.AddToWorklist(NewV1.getNode());
	NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
	DCI.AddToWorklist(NewV2.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	NewV1 = V1; // Save operands in case early exit happens.
	NewV2 = V2;
	if (matchBinaryPermuteVectorShuffle(
	MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
	NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
	DCI.AddToWorklist(NewV1.getNode());
	NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
	DCI.AddToWorklist(NewV2.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Typically from here on, we need an integer version of MaskVT.
	MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
	IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

	// Annoyingly, SSE4A instructions don't map into the above match helpers.
	if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
	uint64_t BitLen, BitIdx;
	if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
	Zeroable)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(IntMaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// Don't try to re-form single instruction chains under any circumstances now
	// that we've done encoding canonicalization for them.
	if (Depth < 2)
	return SDValue();

	// Depth threshold above which we can efficiently use variable mask shuffles.
	// A chain that already contained a variable mask shuffle is always allowed
	// to use one again.
	int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
	bool AllowVariableMask = (Depth >= VariableShuffleDepth) \|\| HasVariableMask;

	bool MaskContainsZeros =
	any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	// Lane-crossing masks are only handled by the VPERMV/VPERMV3 forms inside
	// this block; if none applies we give up rather than fall through to the
	// in-lane shuffles below.
	if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
	// If we have a single input lane-crossing shuffle then lower to VPERMV.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX2() &&
	(MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
	// vector as the second source.
	if (UnaryShuffle && AllowVariableMask &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	// Adjust shuffle mask - replace SM_SentinelZero with second source index.
	// Element i of the zero vector is used for lane i.
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (Mask[i] == SM_SentinelZero)
	Mask[i] = NumMaskElts + i;

	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
	DCI.AddToWorklist(Zero.getNode());
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
	if (AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	V1 = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(MaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	return SDValue();
	}

	// See if we can combine a single input shuffle with zeros to a bit-mask,
	// which is much simpler than any shuffle.
	if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
	isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
	DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
	// Build an AND mask: all-ones for elements that are kept, all-zeros for
	// zeroed elements, and undef for undef elements.
	APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
	APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
	APInt UndefElts(NumMaskElts, 0);
	SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
	for (unsigned i = 0; i != NumMaskElts; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	UndefElts.setBit(i);
	continue;
	}
	if (M == SM_SentinelZero)
	continue;
	EltBits[i] = AllOnes;
	}
	SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
	DCI.AddToWorklist(BitMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	unsigned AndOpcode =
	FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
	Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have a single input shuffle with different shuffle patterns in
	// the 128-bit lanes, use the variable mask form of VPERMILPS.
	// TODO Combine other mask types at higher depths.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) \|\|
	(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
	// Each index is reduced modulo 4, i.e. made relative to its 128-bit lane.
	SmallVector<SDValue, 16> VPermIdx;
	for (int M : Mask) {
	SDValue Idx =
	M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
	VPermIdx.push_back(Idx);
	}
	SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
	// to VPERMIL2PD/VPERMIL2PS.
	if (AllowVariableMask && Subtarget.hasXOP() &&
	(MaskVT == MVT::v2f64 \|\| MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4f32 \|\|
	MaskVT == MVT::v8f32)) {
	// VPERMIL2 Operation.
	// Bits[3] - Match Bit.
	// Bits[2:1] - (Per Lane) PD Shuffle Mask.
	// Bits[2:0] - (Per Lane) PS Shuffle Mask.
	unsigned NumLanes = MaskVT.getSizeInBits() / 128;
	unsigned NumEltsPerLane = NumMaskElts / NumLanes;
	SmallVector<int, 8> VPerm2Idx;
	unsigned M2ZImm = 0;
	for (int M : Mask) {
	if (M == SM_SentinelUndef) {
	VPerm2Idx.push_back(-1);
	continue;
	}
	// A zero element sets the match bit (8) in the selector and switches the
	// immediate to 2.
	if (M == SM_SentinelZero) {
	M2ZImm = 2;
	VPerm2Idx.push_back(8);
	continue;
	}
	// In-lane element index, offset by one lane's worth of elements when the
	// element comes from the second source.
	int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
	// The PD form keeps its selector in bits [2:1], so shift 64-bit indices.
	Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
	VPerm2Idx.push_back(Index);
	}
	V1 = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(MaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPerm2MaskOp.getNode());
	Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
	DAG.getConstant(M2ZImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have 3 or more shuffle instructions or a chain involving a variable
	// mask, we can replace them with a single PSHUFB instruction profitably.
	// Intel's manuals suggest only using PSHUFB if doing so replaces 5
	// instructions, but in practice PSHUFB tends to be very fast so we're
	// more aggressive.
	if (UnaryShuffle && AllowVariableMask &&
	((RootVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(RootVT.is256BitVector() && Subtarget.hasAVX2()) \|\|
	(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
	// Expand the element mask to a per-byte mask; Ratio is the number of
	// bytes per mask element.
	SmallVector<SDValue, 16> PSHUFBMask;
	int NumBytes = RootVT.getSizeInBits() / 8;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	// Zero elements use a control byte of 255.
	if (M == SM_SentinelZero) {
	PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	assert((M / 16) == (i / 16) && "Lane crossing detected");
	PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
	Res = DAG.getBitcast(ByteVT, V1);
	DCI.AddToWorklist(Res.getNode());
	SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
	DCI.AddToWorklist(PSHUFBMaskOp.getNode());
	Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, if we have a 128-bit binary input shuffle we can always combine
	// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
	// slower than PSHUFB on targets that support both.
	if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
	// VPPERM Mask Operation
	// Bits[4:0] - Byte Index (0 - 31)
	// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
	SmallVector<SDValue, 16> VPPERMMask;
	int NumBytes = 16;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	// 128 == (4 << 5), i.e. the ZERO permute operation.
	if (M == SM_SentinelZero) {
	VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::v16i8;
	V1 = DAG.getBitcast(ByteVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(ByteVT, V2);
	DCI.AddToWorklist(V2.getNode());
	SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
	DCI.AddToWorklist(VPPERMMaskOp.getNode());
	Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Failed to find any combines.
	return SDValue();
	}

	/// Attempt to constant fold a shuffle whose source ops are all constants.
	///
	/// Every op in \p Ops must provide constant bits (as reported by
	/// getTargetConstantBitsFromNode) at the mask element width, otherwise nothing
	/// is folded. To avoid constant pool bloat the fold is only performed when at
	/// least one source op has a single use or \p HasVariableMask is set.
	///
	/// \returns The folded constant vector bitcast to the type of \p Root, or an
	/// empty SDValue if no fold was performed.
	// TODO: Extend this to merge multiple constant Ops and update the mask.
	static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
	                                           ArrayRef<int> Mask, SDValue Root,
	                                           bool HasVariableMask,
	                                           SelectionDAG &DAG,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           const X86Subtarget &Subtarget) {
	  MVT RootVT = Root.getSimpleValueType();

	  unsigned NumElts = Mask.size();
	  unsigned EltSizeInBits = RootVT.getSizeInBits() / NumElts;
	  unsigned NumSrcs = Ops.size();

	  // Pull the constant bits (and undef elements) out of every source op,
	  // giving up as soon as one of them is not a constant.
	  bool AnySingleUseSrc = false;
	  SmallVector<APInt, 16> SrcUndefs(NumSrcs);
	  SmallVector<SmallVector<APInt, 16>, 16> SrcBits(NumSrcs);
	  for (unsigned Src = 0; Src != NumSrcs; ++Src) {
	    SDValue SrcOp = Ops[Src];
	    if (SrcOp.hasOneUse())
	      AnySingleUseSrc = true;
	    if (!getTargetConstantBitsFromNode(SrcOp, EltSizeInBits, SrcUndefs[Src],
	                                       SrcBits[Src]))
	      return SDValue();
	  }

	  // Folding creates a new constant; only do so when a source constant is
	  // single-use or the chain already contained a variable mask shuffle.
	  if (!(AnySingleUseSrc || HasVariableMask))
	    return SDValue();

	  // Classify every result element as undef, zero or a non-zero constant and
	  // record its bits.
	  APInt UndefElts(NumElts, 0);
	  APInt ZeroElts(NumElts, 0);
	  APInt ConstantElts(NumElts, 0);
	  SmallVector<APInt, 8> FoldedBits(NumElts,
	                                   APInt::getNullValue(EltSizeInBits));
	  for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
	    int M = Mask[Elt];
	    if (M == SM_SentinelUndef) {
	      UndefElts.setBit(Elt);
	      continue;
	    }
	    if (M == SM_SentinelZero) {
	      ZeroElts.setBit(Elt);
	      continue;
	    }
	    assert(0 <= M && M < (int)(NumElts * NumSrcs));

	    // Split the mask value into a source op index and an element index.
	    unsigned SrcIdx = (unsigned)M / NumElts;
	    unsigned SrcElt = (unsigned)M % NumElts;

	    if (SrcUndefs[SrcIdx][SrcElt]) {
	      UndefElts.setBit(Elt);
	      continue;
	    }

	    APInt &Bits = SrcBits[SrcIdx][SrcElt];
	    if (!Bits) {
	      ZeroElts.setBit(Elt);
	      continue;
	    }

	    ConstantElts.setBit(Elt);
	    FoldedBits[Elt] = Bits;
	  }
	  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

	  // Pick the element type of the constant: floating point only for an FP
	  // root with 32 or 64-bit mask elements, integer otherwise.
	  bool UseFPElts = RootVT.isFloatingPoint() &&
	                   (EltSizeInBits == 32 || EltSizeInBits == 64);
	  MVT EltVT = UseFPElts ? MVT::getFloatingPointVT(EltSizeInBits)
	                        : MVT::getIntegerVT(EltSizeInBits);
	  MVT ConstVT = MVT::getVectorVT(EltVT, NumElts);

	  // Materialize the constant and hand it back in the root type.
	  SDLoc DL(Root);
	  SDValue Folded = getConstVector(FoldedBits, UndefElts, ConstVT, DAG, DL);
	  DCI.AddToWorklist(Folded.getNode());
	  return DAG.getBitcast(RootVT, Folded);
	}

	/// \brief Fully generic combining of x86 shuffle instructions.
	///
	/// This should be the last combine run over the x86 shuffle instructions. Once
	/// they have been fully optimized, this will recursively consider all chains
	/// of single-use shuffle instructions, build a generic model of the cumulative
	/// shuffle operation, and check for simpler instructions which implement this
	/// operation. We use this primarily for two purposes:
	///
	/// 1) Collapse generic shuffles to specialized single instructions when
	/// equivalent. In most cases, this is just an encoding size win, but
	/// sometimes we will collapse multiple generic shuffles into a single
	/// special-purpose shuffle.
	/// 2) Look for sequences of shuffle instructions with 3 or more total
	/// instructions, and replace them with the slightly more expensive SSSE3
	/// PSHUFB instruction if available. We do this as the last combining step
	/// to ensure we avoid using PSHUFB if we can implement the shuffle with
	/// a suitable short sequence of other instructions. The PSHUFB will either
	/// use a register or have to read from memory and so is slightly (but only
	/// slightly) more expensive than the other shuffle instructions.
	///
	/// Because this is inherently a quadratic operation (for each shuffle in
	/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
	/// This should never be an issue in practice as the shuffle lowering doesn't
	/// produce sequences of more than 8 instructions.
	///
	/// FIXME: We will currently miss some cases where the redundant shuffling
	/// would simplify under the threshold for PSHUFB formation because of
	/// combine-ordering. To fix this, we should do the redundant instruction
	/// combining in this recursive walk.
	///
	/// \param SrcOps The source operands accumulated so far.
	/// \param SrcOpIndex Index into \p SrcOps of the operand to look through in
	/// this step.
	/// \param Root The shuffle at the end of the chain; all operands must have
	/// its vector size (asserted).
	/// \param RootMask The mask accumulated so far, indexing into \p SrcOps.
	/// Negative entries are zero/undef sentinels.
	/// \param SrcNodes The nodes already merged into \p RootMask; used to decide
	/// whether a multi-use operand may still be combined.
	/// \param Depth Current recursion depth; the combine bails out above 8.
	/// \param HasVariableMask Whether a variable mask shuffle has been seen so far
	/// in the chain.
	/// \returns The replacement for \p Root, or an empty SDValue if no combine
	/// was found.
	static SDValue combineX86ShufflesRecursively(
	ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
	ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
	bool HasVariableMask, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
	// Bound the depth of our recursive combine because this is ultimately
	// quadratic in nature.
	if (Depth > 8)
	return SDValue();

	// Directly rip through bitcasts to find the underlying operand.
	SDValue Op = SrcOps[SrcOpIndex];
	Op = peekThroughOneUseBitcasts(Op);

	MVT VT = Op.getSimpleValueType();
	if (!VT.isVector())
	return SDValue(); // Bail if we hit a non-vector.

	assert(Root.getSimpleValueType().isVector() &&
	"Shuffles operate on vector types!");
	assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
	"Can only combine shuffles of the same vector register size.");

	// Extract target shuffle mask and resolve sentinels and inputs.
	// If Op cannot be resolved as a target shuffle there is nothing to merge.
	SmallVector<int, 64> OpMask;
	SmallVector<SDValue, 2> OpInputs;
	if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
	return SDValue();

	assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
	SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
	SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

	// Add the inputs to the Ops list, avoiding duplicates.
	SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

	// Look for each input among the existing ops, comparing through bitcasts.
	int InputIdx0 = -1, InputIdx1 = -1;
	for (int i = 0, e = Ops.size(); i < e; ++i) {
	SDValue BC = peekThroughBitcasts(Ops[i]);
	if (Input0 && BC == peekThroughBitcasts(Input0))
	InputIdx0 = i;
	if (Input1 && BC == peekThroughBitcasts(Input1))
	InputIdx1 = i;
	}

	// A new first input takes over the slot of the op being looked through; a
	// new second input is appended to the list.
	if (Input0 && InputIdx0 < 0) {
	InputIdx0 = SrcOpIndex;
	Ops[SrcOpIndex] = Input0;
	}
	if (Input1 && InputIdx1 < 0) {
	InputIdx1 = Ops.size();
	Ops.push_back(Input1);
	}

	assert(((RootMask.size() > OpMask.size() &&
	RootMask.size() % OpMask.size() == 0) \|\|
	(OpMask.size() > RootMask.size() &&
	OpMask.size() % RootMask.size() == 0) \|\|
	OpMask.size() == RootMask.size()) &&
	"The smaller number of elements must divide the larger.");

	// This function can be performance-critical, so we rely on the power-of-2
	// knowledge that we have about the mask sizes to replace div/rem ops with
	// bit-masks and shifts.
	assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
	unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
	unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

	// The merged mask uses the finer of the two granularities. RootRatio and
	// OpRatio are the number of merged mask elements covered by one root mask
	// element and one op mask element respectively; at most one of them is
	// greater than 1.
	unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
	unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
	unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
	assert((RootRatio == 1 \|\| OpRatio == 1) &&
	"Must not have a ratio for both incoming and op masks!");

	assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
	unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
	unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

	SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

	// Merge this shuffle operation's mask into our accumulated mask. Note that
	// this shuffle's mask will be the first applied to the input, followed by the
	// root mask to get us all the way to the root value arrangement. The reason
	// for this order is that we are recursing up the operation chain.
	for (unsigned i = 0; i < MaskWidth; ++i) {
	unsigned RootIdx = i >> RootRatioLog2;
	if (RootMask[RootIdx] < 0) {
	// This is a zero or undef lane, we're done.
	Mask[i] = RootMask[RootIdx];
	continue;
	}

	// Scale the root mask value up to the merged mask granularity.
	unsigned RootMaskedIdx =
	RootRatio == 1
	? RootMask[RootIdx]
	: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

	// Just insert the scaled root mask value if it references an input other
	// than the SrcOp we're currently inserting.
	if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) \|\|
	(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
	Mask[i] = RootMaskedIdx;
	continue;
	}

	// Reduce to an element index within the op being looked through.
	RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
	unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
	if (OpMask[OpIdx] < 0) {
	// The incoming lanes are zero or undef, it doesn't matter which ones we
	// are using.
	Mask[i] = OpMask[OpIdx];
	continue;
	}

	// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
	unsigned OpMaskedIdx =
	OpRatio == 1
	? OpMask[OpIdx]
	: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

	// Op mask values below OpMask.size() refer to the first input, the rest to
	// the second; rebase the element index onto that input's slot in Ops.
	OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
	if (OpMask[OpIdx] < (int)OpMask.size()) {
	assert(0 <= InputIdx0 && "Unknown target shuffle input");
	OpMaskedIdx += InputIdx0 * MaskWidth;
	} else {
	assert(0 <= InputIdx1 && "Unknown target shuffle input");
	OpMaskedIdx += InputIdx1 * MaskWidth;
	}

	Mask[i] = OpMaskedIdx;
	}

	// Handle the all undef/zero cases early.
	if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
	return DAG.getUNDEF(Root.getValueType());

	// TODO - should we handle the mixed zero/undef case as well? Just returning
	// a zero mask will lose information on undef elements possibly reducing
	// future combine possibilities.
	if (all_of(Mask, [](int Idx) { return Idx < 0; }))
	return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
	SDLoc(Root));

	// Remove unused shuffle source ops.
	resolveTargetShuffleInputsAndMask(Ops, Mask);
	assert(!Ops.empty() && "Shuffle with no inputs detected");

	HasVariableMask \|= isTargetShuffleVariableMask(Op.getOpcode());

	// Update the list of shuffle nodes that have been combined so far.
	SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
	SrcNodes.end());
	CombinedNodes.push_back(Op.getNode());

	// See if we can recurse into each shuffle source op (if it's a target
	// shuffle). The source op should only be combined if it either has a
	// single use (i.e. current Op) or all its users have already been combined.
	// The first successful deeper combine wins.
	for (int i = 0, e = Ops.size(); i < e; ++i)
	if (Ops[i].getNode()->hasOneUse() \|\|
	SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
	if (SDValue Res = combineX86ShufflesRecursively(
	Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
	DAG, DCI, Subtarget))
	return Res;

	// Attempt to constant fold all of the constant source ops.
	if (SDValue Cst = combineX86ShufflesConstants(
	Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
	return Cst;

	// We can only combine unary and binary shuffle mask cases.
	if (Ops.size() > 2)
	return SDValue();

	// Minor canonicalization of the accumulated shuffle mask to make it easier
	// to match below. All this does is detect masks with sequential pairs of
	// elements, and shrink them to the half-width mask. It does this in a loop
	// so it will reduce the size of the mask to the minimal width mask which
	// performs an equivalent shuffle.
	SmallVector<int, 64> WidenedMask;
	while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
	Mask = std::move(WidenedMask);
	}

	// Canonicalization of binary shuffle masks to improve pattern matching by
	// commuting the inputs.
	if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
	ShuffleVectorSDNode::commuteMask(Mask);
	std::swap(Ops[0], Ops[1]);
	}

	// Finally, try to combine into a single shuffle instruction.
	return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
	DCI, Subtarget);
	}

	/// \brief Get the PSHUF-style mask from a PSHUF node.
	///
	/// A thin wrapper around getTargetShuffleMask that eases forming v4
	/// PSHUF-style masks which can be reused with such instructions. For vectors
	/// wider than 128 bits only the low 128-bit lane of the mask is kept. PSHUFD
	/// returns that mask as is, PSHUFLW returns its first four entries, and
	/// PSHUFHW returns the entries after the first four, rebased to start at 0.
	static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
	  MVT NodeVT = N.getSimpleValueType();
	  SmallVector<int, 4> Mask;
	  SmallVector<SDValue, 2> Ops;
	  bool IsUnary;
	  bool HaveMask =
	      getTargetShuffleMask(N.getNode(), NodeVT, false, Ops, Mask, IsUnary);
	  (void)HaveMask;
	  assert(HaveMask);

	  // If we have more than 128-bits, only the low 128-bits of shuffle mask
	  // matter. Check that the upper masks are repeats and remove them.
	  unsigned VecSizeInBits = NodeVT.getSizeInBits();
	  if (VecSizeInBits > 128) {
	    int EltsPerLane = 128 / NodeVT.getScalarSizeInBits();
	#ifndef NDEBUG
	    int LaneCount = VecSizeInBits / 128;
	    for (int Lane = 1; Lane < LaneCount; ++Lane)
	      for (int Elt = 0; Elt < EltsPerLane; ++Elt)
	        assert(Mask[Elt] ==
	                   Mask[Lane * EltsPerLane + Elt] - (EltsPerLane * Lane) &&
	               "Mask doesn't repeat in high 128-bit lanes!");
	#endif
	    Mask.resize(EltsPerLane);
	  }

	  unsigned Opcode = N.getOpcode();

	  // Dword shuffle: the mask is already in the right form.
	  if (Opcode == X86ISD::PSHUFD)
	    return Mask;

	  // Low word shuffle: keep only the first four entries.
	  if (Opcode == X86ISD::PSHUFLW) {
	    Mask.resize(4);
	    return Mask;
	  }

	  // High word shuffle: drop the first four entries and rebase the rest so
	  // that they index from zero.
	  if (Opcode == X86ISD::PSHUFHW) {
	    Mask.erase(Mask.begin(), Mask.begin() + 4);
	    for (unsigned i = 0, e = Mask.size(); i != e; ++i)
	      Mask[i] -= 4;
	    return Mask;
	  }

	  llvm_unreachable("No valid shuffle instruction found!");
	}

	/// \brief Search for a combinable shuffle across a chain ending in pshufd.
	///
	/// We walk up the chain and look for a combinable shuffle, skipping over
	/// shuffles that we could hoist this shuffle's transformation past without
	/// altering anything.
	///
	/// \param N    The PSHUFD node at the bottom of the chain (asserted below).
	/// \param Mask The dword mask of \p N, indexed here as four entries. It is
	///             rewritten in place to the merged mask once a combinable
	///             shuffle has been found.
	/// \param DAG  The DAG in which the replacement nodes are created.
	/// \returns The value at the bottom of a freshly built chain that replaces
	///          \p N, or an empty SDValue if nothing was combined.
	static SDValue
	combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(N.getOpcode() == X86ISD::PSHUFD &&
	"Called with something other than an x86 128-bit half shuffle!");
	SDLoc DL(N);

	// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
	// of the shuffles in the chain so that we can form a fresh chain to replace
	// this one.
	SmallVector<SDValue, 8> Chain;
	SDValue V = N.getOperand(0);
	// Inside the switch, 'continue' moves on to the next node up the chain,
	// while 'break' leaves the switch and then the loop with V pointing at the
	// shuffle to merge with.
	for (; V.hasOneUse(); V = V.getOperand(0)) {
	switch (V.getOpcode()) {
	default:
	return SDValue(); // Nothing combined!

	case ISD::BITCAST:
	// Skip bitcasts as we always know the type for the target specific
	// instructions.
	continue;

	case X86ISD::PSHUFD:
	// Found another dword shuffle.
	break;

	case X86ISD::PSHUFLW:
	// Check that the low words (being shuffled) are the identity in the
	// dword shuffle, and the high words are self-contained.
	if (Mask[0] != 0 \|\| Mask[1] != 1 \|\|
	!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
	return SDValue();

	// Remember the skipped shuffle so it can be re-created below.
	Chain.push_back(V);
	continue;

	case X86ISD::PSHUFHW:
	// Check that the high words (being shuffled) are the identity in the
	// dword shuffle, and the low words are self-contained.
	if (Mask[2] != 2 \|\| Mask[3] != 3 \|\|
	!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
	return SDValue();

	// Remember the skipped shuffle so it can be re-created below.
	Chain.push_back(V);
	continue;

	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
	// shuffle into a preceding word shuffle.
	if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
	V.getSimpleValueType().getVectorElementType() != MVT::i16)
	return SDValue();

	// Search for a half-shuffle which we can combine with.
	unsigned CombineOp =
	V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
	// Only unpacks of a value with itself are handled, and only when the
	// unpack is the sole user of that value.
	if (V.getOperand(0) != V.getOperand(1) \|\|
	!V->isOnlyUserOf(V.getOperand(0).getNode()))
	return SDValue();
	Chain.push_back(V);
	V = V.getOperand(0);
	// Inner walk: look through bitcasts and word shuffles of the other half
	// until the word shuffle with opcode CombineOp is reached. Leaving the
	// do/while because V has more than one use is caught by the hasOneUse
	// check after the outer loop.
	do {
	switch (V.getOpcode()) {
	default:
	return SDValue(); // Nothing to combine.

	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	if (V.getOpcode() == CombineOp)
	break;

	Chain.push_back(V);

	LLVM_FALLTHROUGH;
	case ISD::BITCAST:
	V = V.getOperand(0);
	continue;
	}
	break;
	} while (V.hasOneUse());
	break;
	}
	// Break out of the loop if we break out of the switch.
	break;
	}

	if (!V.hasOneUse())
	// We fell out of the loop without finding a viable combining instruction.
	return SDValue();

	// Merge this node's mask and our incoming mask.
	// Each entry of Mask is replaced by the entry of V's mask that it selects.
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	for (int &M : Mask)
	M = VMask[M];
	V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Rebuild the chain around this new shuffle.
	// Chain was filled while walking upwards, so popping from the back re-creates
	// the skipped nodes starting with the one closest to the merged shuffle.
	while (!Chain.empty()) {
	SDValue W = Chain.pop_back_val();

	// Re-insert a bitcast wherever the original chain changed the type.
	if (V.getValueType() != W.getOperand(0).getValueType())
	V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

	switch (W.getOpcode()) {
	default:
	llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	// The unpacks recorded above always had identical operands.
	V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
	break;

	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	// Reuse the original immediate operand of the skipped shuffle.
	V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
	break;
	}
	}
	if (V.getValueType() != N.getValueType())
	V = DAG.getBitcast(N.getValueType(), V);

	// Return the new chain to replace N.
	return V;
	}

	/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
	/// pshufhw.
	///
	/// Starting at the input of \p N, walk up the single-use chain, looking
	/// through bitcasts and half shuffles of the other half, until a half shuffle
	/// with the same opcode as \p N is found. \p N is then replaced by its input
	/// and the found shuffle is replaced by one carrying the merged mask.
	///
	/// \param N    The PSHUFLW/PSHUFHW node at the bottom of the chain (asserted).
	/// \param Mask The mask of \p N; rewritten in place to the merged mask when a
	///             combinable shuffle is found.
	/// \param DAG  The DAG in which the merged shuffle is created.
	/// \param DCI  Combiner interface used to perform the replacements.
	/// \returns true if \p N was combined away, false if nothing was changed.
	static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
	                                        SelectionDAG &DAG,
	                                        TargetLowering::DAGCombinerInfo &DCI) {
	  assert(
	      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
	      "Called with something other than an x86 128-bit half shuffle!");
	  SDLoc DL(N);
	  unsigned CombineOpcode = N.getOpcode();

	  // Walk up a single-use chain looking for a combinable shuffle. Inside the
	  // switch, 'continue' advances to the next node up the chain and 'break'
	  // leaves both the switch and the loop with V at the matching shuffle.
	  SDValue V = N.getOperand(0);
	  for (; V.hasOneUse(); V = V.getOperand(0)) {
	    switch (V.getOpcode()) {
	    default:
	      return false; // Nothing combined!

	    case ISD::BITCAST:
	      // Skip bitcasts as we always know the type for the target specific
	      // instructions.
	      continue;

	    case X86ISD::PSHUFLW:
	    case X86ISD::PSHUFHW:
	      if (V.getOpcode() == CombineOpcode)
	        break;

	      // Other-half shuffles are no-ops.
	      continue;
	    }
	    // Break out of the loop if we break out of the switch.
	    break;
	  }

	  if (!V.hasOneUse())
	    // We fell out of the loop without finding a viable combining instruction.
	    return false;

	  // Combine away the bottom node as its shuffle will be accumulated into
	  // a preceding shuffle.
	  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

	  // Record the old value.
	  SDValue Old = V;

	  // Merge this node's mask and our incoming mask: every entry of Mask is
	  // replaced by the entry of V's mask that it selects.
	  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	  for (int &M : Mask)
	    M = VMask[M];

	  // Build the merged shuffle with the type of the node it replaces rather
	  // than a fixed v8i16, so that the replacement below stays type-correct for
	  // word shuffles wider than 128 bits as well.
	  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
	                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	  // Check that the shuffles didn't cancel each other out. If not, we need to
	  // combine to the new one.
	  if (Old != V)
	    // Replace the combinable shuffle with the combined one, updating all users
	    // so that we re-evaluate the chain here.
	    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

	  return true;
	}

	/// \brief Try to combine x86 target specific shuffles.
	///
	/// Handles, in order: a 64-bit-element MOVSD/UNPCKL/UNPCKH of two identical
	/// horizontal/pack ops, then opcode-specific combines for UNPCKL, BLENDI,
	/// MOVSD/MOVSS and INSERTPS, and finally the PSHUFD/PSHUFLW/PSHUFHW
	/// simplifications that work on the node's 4-entry mask.
	///
	/// \param N         The target shuffle node to combine.
	/// \param DAG       The DAG in which replacement nodes are created.
	/// \param DCI       Combiner interface (worklist additions and CombineTo).
	/// \param Subtarget Not referenced in this function body.
	/// \returns A replacement value, or an empty SDValue when nothing was
	///          combined or when the replacement was already performed through
	///          \p DCI.
	static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	MVT VT = N.getSimpleValueType();
	SmallVector<int, 4> Mask;
	unsigned Opcode = N.getOpcode();

	// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
	// single instruction.
	if (VT.getScalarSizeInBits() == 64 &&
	(Opcode == X86ISD::MOVSD \|\| Opcode == X86ISD::UNPCKH \|\|
	Opcode == X86ISD::UNPCKL)) {
	auto BC0 = peekThroughBitcasts(N.getOperand(0));
	auto BC1 = peekThroughBitcasts(N.getOperand(1));
	EVT VT0 = BC0.getValueType();
	EVT VT1 = BC1.getValueType();
	unsigned Opcode0 = BC0.getOpcode();
	unsigned Opcode1 = BC1.getOpcode();
	if (Opcode0 == Opcode1 && VT0 == VT1 &&
	(Opcode0 == X86ISD::FHADD \|\| Opcode0 == X86ISD::HADD \|\|
	Opcode0 == X86ISD::FHSUB \|\| Opcode0 == X86ISD::HSUB \|\|
	Opcode0 == X86ISD::PACKSS \|\| Opcode0 == X86ISD::PACKUS)) {
	// Pick the operands of the two source ops that feed the low and high
	// half of the shuffled result.
	SDValue Lo, Hi;
	if (Opcode == X86ISD::MOVSD) {
	Lo = BC1.getOperand(0);
	Hi = BC0.getOperand(1);
	} else {
	Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
	Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
	}
	SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
	DCI.AddToWorklist(Horiz.getNode());
	return DAG.getBitcast(VT, Horiz);
	}
	}

	// Opcode-specific combines. Only the PSHUF* cases fall out of this switch
	// (with Mask filled in); every other case returns directly.
	switch (Opcode) {
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	Mask = getPSHUFShuffleMask(N);
	assert(Mask.size() == 4);
	break;
	case X86ISD::UNPCKL: {
	// Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
	// which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
	// moves upper half elements into the lower half part. For example:
	//
	// t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
	// undef:v16i8
	// t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
	//
	// will be combined to:
	//
	// t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

	// This is only for 128-bit vectors. From SSE4.1 onward this combine may not
	// happen due to advanced instructions.
	if (!VT.is128BitVector())
	return SDValue();

	auto Op0 = N.getOperand(0);
	auto Op1 = N.getOperand(1);
	if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
	// Note: this local Mask shadows the function-level Mask.
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

	// Expected mask: upper-half indices in the low half, undef (-1) above.
	unsigned NumElts = VT.getVectorNumElements();
	SmallVector<int, 8> ExpectedMask(NumElts, -1);
	std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
	NumElts / 2);

	auto ShufOp = Op1.getOperand(0);
	if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
	return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
	}
	return SDValue();
	}
	case X86ISD::BLENDI: {
	SDValue V0 = N->getOperand(0);
	SDValue V1 = N->getOperand(1);
	assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
	"Unexpected input vector types");

	// Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
	// operands and changing the mask to 1. This saves us a bunch of
	// pattern-matching possibilities related to scalar math ops in SSE/AVX.
	// x86InstrInfo knows how to commute this back after instruction selection
	// if it would help register allocation.

	// TODO: If optimizing for size or a processor that doesn't suffer from
	// partial register update stalls, this should be transformed into a MOVSD
	// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

	if (VT == MVT::v2f64)
	if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
	if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
	SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
	return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
	}

	return SDValue();
	}
	case X86ISD::MOVSD:
	case X86ISD::MOVSS: {
	SDValue V0 = peekThroughBitcasts(N->getOperand(0));
	SDValue V1 = peekThroughBitcasts(N->getOperand(1));
	bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
	bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
	if (isZero0 && isZero1)
	return SDValue();

	// We often lower to MOVSD/MOVSS from integer as well as native float
	// types; remove unnecessary domain-crossing bitcasts if we can to make it
	// easier to combine shuffles later on. We've already accounted for the
	// domain switching cost when we decided to lower with it.
	bool isFloat = VT.isFloatingPoint();
	bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
	bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
	// Re-create the node in the other domain only when each input is either
	// all-zeros or already lives in that other domain.
	if ((isFloat != isFloat0 \|\| isZero0) && (isFloat != isFloat1 \|\| isZero1)) {
	MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
	: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
	V0 = DAG.getBitcast(NewVT, V0);
	V1 = DAG.getBitcast(NewVT, V1);
	return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
	}

	return SDValue();
	}
	case X86ISD::INSERTPS: {
	assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
	SDValue Op0 = N.getOperand(0);
	SDValue Op1 = N.getOperand(1);
	SDValue Op2 = N.getOperand(2);
	// Decode the immediate: bits 7:6 select the source element of Op1, bits 5:4
	// the destination element, and bits 3:0 are the zero mask.
	unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
	unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
	unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
	unsigned ZeroMask = InsertPSMask & 0xF;

	// If we zero out all elements from Op0 then we don't need to reference it.
	if (((ZeroMask \| (1u << DstIdx)) == 0xF) && !Op0.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// If we zero out the element from Op1 then we don't need to reference it.
	if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// Attempt to merge insertps Op1 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask1;
	SmallVector<SDValue, 2> Ops1;
	if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
	int M = TargetMask1[SrcIdx];
	if (isUndefOrZero(M)) {
	// Zero/UNDEF insertion - zero out element and remove dependency.
	InsertPSMask \|= (1u << DstIdx);
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}
	// Update insertps mask srcidx and reference the source input directly.
	assert(0 <= M && M < 8 && "Shuffle index out of range");
	InsertPSMask = (InsertPSMask & 0x3f) \| ((M & 0x3) << 6);
	// Indices 0-3 refer to the inner shuffle's first input, 4-7 to its second.
	Op1 = Ops1[M < 4 ? 0 : 1];
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}

	// Attempt to merge insertps Op0 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask0;
	SmallVector<SDValue, 2> Ops0;
	if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
	return SDValue();

	// Updated records whether the immediate or Op0 changed; UseInput00/01 record
	// which of the inner shuffle's inputs the surviving elements come from.
	bool Updated = false;
	bool UseInput00 = false;
	bool UseInput01 = false;
	for (int i = 0; i != 4; ++i) {
	int M = TargetMask0[i];
	if ((InsertPSMask & (1u << i)) \|\| (i == (int)DstIdx)) {
	// No change if element is already zero or the inserted element.
	continue;
	} else if (isUndefOrZero(M)) {
	// If the target mask is undef/zero then we must zero the element.
	InsertPSMask \|= (1u << i);
	Updated = true;
	continue;
	}

	// The input vector element must be inline.
	if (M != i && M != (i + 4))
	return SDValue();

	// Determine which inputs of the target shuffle we're using.
	UseInput00 \|= (0 <= M && M < 4);
	UseInput01 \|= (4 <= M);
	}

	// If we're not using both inputs of the target shuffle then use the
	// referenced input directly.
	if (UseInput00 && !UseInput01) {
	Updated = true;
	Op0 = Ops0[0];
	} else if (!UseInput00 && UseInput01) {
	Updated = true;
	Op0 = Ops0[1];
	}

	if (Updated)
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	return SDValue();
	}
	default:
	return SDValue();
	}

	// Only PSHUFD/PSHUFLW/PSHUFHW reach this point, with Mask holding the
	// node's 4-entry mask.

	// Nuke no-op shuffles that show up after combining.
	if (isNoopShuffleMask(Mask))
	return DCI.CombineTo(N.getNode(), N.getOperand(0), /AddTo/ true);

	// Look for simplifications involving one or two shuffle instructions.
	SDValue V = N.getOperand(0);
	switch (N.getOpcode()) {
	default:
	break;
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

	if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
	return SDValue(); // We combined away this shuffle, so we're done.

	// See if this reduces to a PSHUFD which is no more expensive and can
	// combine with more operations. Note that it has to at least flip the
	// dwords as otherwise it would have been removed as a no-op.
	if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
	// Start from the identity dword mask and swap the two dwords that
	// belong to the half this node shuffles.
	int DMask[] = {0, 1, 2, 3};
	int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
	DMask[DOffset + 0] = DOffset + 1;
	DMask[DOffset + 1] = DOffset + 0;
	MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
	V = DAG.getBitcast(DVT, V);
	DCI.AddToWorklist(V.getNode());
	V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
	getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
	DCI.AddToWorklist(V.getNode());
	return DAG.getBitcast(VT, V);
	}

	// Look for shuffle patterns which can be implemented as a single unpack.
	// FIXME: This doesn't handle the location of the PSHUFD generically, and
	// only works when we have a PSHUFD followed by two half-shuffles.
	if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
	(V.getOpcode() == X86ISD::PSHUFLW \|\|
	V.getOpcode() == X86ISD::PSHUFHW) &&
	V.getOpcode() != N.getOpcode() &&
	V.hasOneUse()) {
	SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
	if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
	// N and V shuffle opposite halves, so together they fill all eight
	// entries of the combined word mask.
	int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int WordMask[8];
	for (int i = 0; i < 4; ++i) {
	WordMask[i + NOffset] = Mask[i] + NOffset;
	WordMask[i + VOffset] = VMask[i] + VOffset;
	}
	// Map the word mask through the DWord mask.
	int MappedMask[8];
	for (int i = 0; i < 8; ++i)
	MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
	if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) \|\|
	makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
	// We can replace all three shuffles with an unpack.
	V = DAG.getBitcast(VT, D.getOperand(0));
	DCI.AddToWorklist(V.getNode());
	return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
	: X86ISD::UNPCKH,
	DL, VT, V, V);
	}
	}
	}

	break;

	case X86ISD::PSHUFD:
	if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
	return NewN;

	break;
	}

	return SDValue();
	}

	/// Decide whether the shuffle node \p N can be replaced with an ADDSUB
	/// (or, when \p matchSubAdd is set, a SUBADD) operation.
	///
	/// The match is done directly on target-independent vector shuffle nodes:
	/// \p N must blend an FSUB and an FADD of the same two operands, taking its
	/// even result elements from one and its odd result elements from the other.
	///
	/// \param N           The candidate shuffle node.
	/// \param Subtarget   Used to check that the vector type is supported.
	/// \param Opnd0       Set to the first operand of the matched operation.
	/// \param Opnd1       Set to the second operand of the matched operation.
	/// \param matchSubAdd When true, the roles of FADD and FSUB are exchanged.
	/// \returns true on a match; \p Opnd0 and \p Opnd1 are written only then.
	static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
	                             SDValue &Opnd0, SDValue &Opnd1,
	                             bool matchSubAdd = false) {
	  // The result type must be a float vector type available on this subtarget.
	  const EVT VT = N->getValueType(0);
	  const bool Fits128 =
	      Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64);
	  const bool Fits256 =
	      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
	  const bool Fits512 =
	      Subtarget.hasAVX512() && (VT == MVT::v16f32 || VT == MVT::v8f64);
	  if (!Fits128 && !Fits256 && !Fits512)
	    return false;

	  // We only handle target-independent shuffles.
	  // FIXME: It would be easy and harmless to use the target shuffle mask
	  // extraction tool to support more.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  // Work on a private copy of the mask since it may have to be commuted.
	  ArrayRef<int> NodeMask = cast<ShuffleVectorSDNode>(N)->getMask();
	  SmallVector<int, 16> Mask(NodeMask.begin(), NodeMask.end());

	  SDValue First = N->getOperand(0);
	  SDValue Second = N->getOperand(1);

	  // Canonical form: the first shuffle input has opcode FirstOpc and the
	  // second one SecondOpc. The mirrored form is commuted into the canonical
	  // one; anything else is rejected.
	  const unsigned FirstOpc = matchSubAdd ? ISD::FADD : ISD::FSUB;
	  const unsigned SecondOpc = matchSubAdd ? ISD::FSUB : ISD::FADD;
	  const bool IsMirrored =
	      First.getOpcode() == SecondOpc && Second.getOpcode() == FirstOpc;
	  const bool IsCanonical =
	      First.getOpcode() == FirstOpc && Second.getOpcode() == SecondOpc;
	  if (IsMirrored) {
	    ShuffleVectorSDNode::commuteMask(Mask);
	    std::swap(First, Second);
	  } else if (!IsCanonical) {
	    return false;
	  }

	  // If there are other uses of these operations we can't fold them.
	  if (!First->hasOneUse() || !Second->hasOneUse())
	    return false;

	  // Both operations must work on the same pair of operands, in either order.
	  SDValue LHS = First->getOperand(0), RHS = First->getOperand(1);
	  const bool SameOrder =
	      Second->getOperand(0) == LHS && Second->getOperand(1) == RHS;
	  const bool SwappedOrder =
	      Second->getOperand(0) == RHS && Second->getOperand(1) == LHS;
	  if (!SameOrder && !SwappedOrder)
	    return false;

	  // The blend must take even elements from the first input and odd elements
	  // from the second one, for one of the supported element counts.
	  const bool IsAlternatingBlend =
	      isShuffleEquivalent(First, Second, Mask, {0, 3}) ||
	      isShuffleEquivalent(First, Second, Mask, {0, 5, 2, 7}) ||
	      isShuffleEquivalent(First, Second, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
	      isShuffleEquivalent(First, Second, Mask,
	                          {0, 17, 2, 19, 4, 21, 6, 23,
	                           8, 25, 10, 27, 12, 29, 14, 31});
	  if (!IsAlternatingBlend)
	    return false;

	  Opnd0 = LHS;
	  Opnd1 = RHS;
	  return true;
	}

	/// \brief Try to turn a shuffle into a target-specific add-sub or
	/// mul-add-sub node.
	///
	/// \returns An X86ISD::FMADDSUB or X86ISD::ADDSUB node replacing \p N, or an
	///          empty SDValue when \p N is not an add-sub blend or when the only
	///          option would be a 512-bit ADDSUB.
	static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
	                                                const X86Subtarget &Subtarget,
	                                                SelectionDAG &DAG) {
	  SDValue Op0, Op1;
	  if (!isAddSubOrSubAdd(N, Subtarget, Op0, Op1))
	    return SDValue();

	  SDLoc DL(N);
	  const EVT VT = N->getValueType(0);

	  // Prefer the fused form: try to generate an X86ISD::FMADDSUB node here.
	  SDValue Op2;
	  const bool IsFused = isFMAddSubOrFMSubAdd(Subtarget, DAG, Op0, Op1, Op2, 2);
	  if (IsFused)
	    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Op0, Op1, Op2);

	  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
	  // the ADDSUB idiom has been successfully recognized. There are no known
	  // X86 targets with 512-bit ADDSUB instructions!
	  return VT.is512BitVector() ? SDValue()
	                             : DAG.getNode(X86ISD::ADDSUB, DL, VT, Op0, Op1);
	}

	/// \brief Try to turn a shuffle into a target-specific mul-sub-add node.
	///
	/// \returns An X86ISD::FMSUBADD node replacing \p N, or an empty SDValue when
	///          \p N is not a sub-add blend or cannot be fused.
	static SDValue combineShuffleToFMSubAdd(SDNode *N,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // Ask for the SUBADD flavour of the blend pattern.
	  SDValue Op0, Op1;
	  const bool IsSubAdd =
	      isAddSubOrSubAdd(N, Subtarget, Op0, Op1, /*matchSubAdd=*/true);
	  if (!IsSubAdd)
	    return SDValue();

	  SDLoc DL(N);
	  const EVT VT = N->getValueType(0);

	  // Only the fused X86ISD::FMSUBADD form is generated here.
	  SDValue Op2;
	  if (!isFMAddSubOrFMSubAdd(Subtarget, DAG, Op0, Op1, Op2, 2))
	    return SDValue();

	  return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Op0, Op1, Op2);
	}

	// Rewrite a shuffle whose two sources are each "concat_vectors X, undef"
	// (X being half as wide as the result) into a single-source shuffle of
	// "concat_vectors X0, X1". AVX2 has VPERMD/Q, so expressing this as a
	// single-source shuffle is preferable.
	// Returns the new shuffle, or an empty SDValue if the pattern does not match.
	static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasAVX2())
	    return SDValue();

	  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N);
	  if (!Shuf)
	    return SDValue();

	  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
	  const EVT VT = N->getValueType(0);
	  if (!VT.is128BitVector() && !VT.is256BitVector())
	    return SDValue();

	  const EVT EltVT = VT.getVectorElementType();
	  if (EltVT != MVT::i32 && EltVT != MVT::i64 && EltVT != MVT::f32 &&
	      EltVT != MVT::f64)
	    return SDValue();

	  // Both sources must be two-operand concats whose upper part is undef.
	  auto IsConcatWithUndef = [](SDValue Op) {
	    return Op.getOpcode() == ISD::CONCAT_VECTORS &&
	           Op.getNumOperands() == 2 && Op.getOperand(1).isUndef();
	  };
	  SDValue Src0 = N->getOperand(0);
	  SDValue Src1 = N->getOperand(1);
	  if (!IsConcatWithUndef(Src0) || !IsConcatWithUndef(Src1))
	    return SDValue();

	  // Construct the new shuffle mask. Elements from the first source retain
	  // their index, but elements from the second source no longer need to skip
	  // an undef half.
	  const int NumElts = VT.getVectorNumElements();
	  SmallVector<int, 8> NewMask;
	  for (int Elt : Shuf->getMask()) {
	    if (Elt < NumElts)
	      NewMask.push_back(Elt);
	    else
	      NewMask.push_back(Elt - NumElts / 2);
	  }

	  SDLoc DL(N);
	  SDValue Joined = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
	                               Src0.getOperand(0), Src1.getOperand(0));
	  return DAG.getVectorShuffle(VT, DL, Joined, DAG.getUNDEF(VT), NewMask);
	}

	/// Remove a shuffle that merely replicates the halves of a horizontal math
	/// op whose two operands are identical.
	///
	/// \returns The horizontal op itself when the shuffle is redundant, otherwise
	///          an empty SDValue.
	static SDValue foldShuffleOfHorizOp(SDNode *N) {
	  // Only unary target-independent shuffles are considered.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return SDValue();
	  if (!N->getOperand(1).isUndef())
	    return SDValue();

	  SDValue HOp = N->getOperand(0);
	  switch (HOp.getOpcode()) {
	  case X86ISD::HADD:
	  case X86ISD::FHADD:
	  case X86ISD::HSUB:
	  case X86ISD::FHSUB:
	    break;
	  default:
	    return SDValue();
	  }

	  // 128-bit horizontal math instructions are defined to operate on adjacent
	  // lanes of each operand as:
	  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
	  // ...similarly for v2f64 and v8i16.
	  // TODO: 256-bit is not the same because...x86.
	  const bool SameOperands = HOp.getOperand(0) == HOp.getOperand(1);
	  if (!SameOperands || HOp.getValueSizeInBits() != 128)
	    return SDValue();

	  // With identical operands the low half of the result equals the high half,
	  // so a shuffle that only replicates low and high halves changes nothing:
	  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
	  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
	  // but this should be tied to whatever horizontal op matching and shuffle
	  // canonicalization are producing.
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	  const bool ReplicatesHalves =
	      isTargetShuffleEquivalent(Mask, {0, 0}) ||
	      isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
	      isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3});
	  if (ReplicatesHalves)
	    return HOp;

	  return SDValue();
	}

	/// Top-level combine for a shuffle node \p N.
	///
	/// Tries, in order: fusing FADD/FSUB blends into ADDSUB/FMADDSUB/FMSUBADD and
	/// removing redundant shuffles of horizontal ops (legal result types only);
	/// pushing a bitcast through a binary op feeding the shuffle; folding a
	/// shuffle of consecutive loads into one load; merging two concat-with-undef
	/// sources; and finally, for target shuffle opcodes, the target shuffle
	/// combines followed by the recursive shuffle combiner.
	///
	/// \returns A replacement value, or an empty SDValue when nothing was
	///          combined or when the replacement was already performed through
	///          \p DCI.
	static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	// If we have legalized the vector types, look for blends of FADD and FSUB
	// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
	if (TLI.isTypeLegal(VT)) {
	if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
	return AddSub;

	if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
	return FMSubAdd;

	if (SDValue HAddSub = foldShuffleOfHorizOp(N))
	return HAddSub;
	}

	// During Type Legalization, when promoting illegal vector types,
	// the backend might introduce new shuffle dag nodes and bitcasts.
	//
	// This code performs the following transformation:
	// fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
	// (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
	//
	// We do this only if both the bitcast and the BINOP dag nodes have
	// one use. Also, perform this transformation only if the new binary
	// operation is legal. This is to avoid introducing dag nodes that
	// potentially need to be further expanded (or custom lowered) into a
	// less optimal sequence of dag nodes.
	if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
	N->getOpcode() == ISD::VECTOR_SHUFFLE &&
	N->getOperand(0).getOpcode() == ISD::BITCAST &&
	N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	SDValue BC0 = N0.getOperand(0);
	EVT SVT = BC0.getValueType();
	unsigned Opcode = BC0.getOpcode();
	unsigned NumElts = VT.getVectorNumElements();

	// The bitcast source must be a single-use vector with half as many
	// elements as the shuffle result.
	if (BC0.hasOneUse() && SVT.isVector() &&
	SVT.getVectorNumElements() * 2 == NumElts &&
	TLI.isOperationLegal(Opcode, VT)) {
	bool CanFold = false;
	switch (Opcode) {
	default : break;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	// isOperationLegal lies for integer ops on floating point types.
	CanFold = VT.isInteger();
	break;
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	// isOperationLegal lies for floating point ops on integer types.
	CanFold = VT.isFloatingPoint();
	break;
	}

	// The mask must select the even elements <0, 2, 4, ...> into the low
	// half of the result and leave the upper half undefined.
	unsigned SVTNumElts = SVT.getVectorNumElements();
	ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
	for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
	CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
	for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
	CanFold = SVOp->getMaskElt(i) < 0;

	if (CanFold) {
	SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
	SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
	SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
	return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
	}
	}
	}

	// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
	// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
	// consecutive, non-overlapping, and in the right order.
	// Collect the scalar feeding every result element; give up (empty list) as
	// soon as one element cannot be resolved.
	SmallVector<SDValue, 16> Elts;
	for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
	if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
	Elts.push_back(Elt);
	continue;
	}
	Elts.clear();
	break;
	}

	if (Elts.size() == VT.getVectorNumElements())
	if (SDValue LD =
	EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
	return LD;

	// For AVX2, we sometimes want to combine
	// (vector_shuffle <mask> (concat_vectors t1, undef)
	// (concat_vectors t2, undef))
	// Into:
	// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
	// Since the latter can be efficiently lowered with VPERMD/VPERMQ
	if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
	return ShufConcat;

	if (isTargetShuffle(N->getOpcode())) {
	SDValue Op(N, 0);
	if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
	return Shuffle;

	// Try recursively combining arbitrary sequences of x86 shuffle
	// instructions into higher-order shuffles. We do this after combining
	// specific PSHUF instruction sequences into their minimal form so that we
	// can evaluate how many specialized shuffle instructions are involved in
	// a particular chain.
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	// The replacement is performed here, so an empty SDValue is returned to
	// the caller.
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	return SDValue();
	}

	/// Check if a vector extract from a target-specific shuffle of a load can be
	/// folded into a single element load.
	/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
	/// shuffles have been custom lowered so we need to handle those here.
	///
	/// \param N   The extract node: operand 0 is the input vector and operand 1
	///            the element index, which must be a constant.
	/// \param DAG The DAG in which replacement nodes are created.
	/// \param DCI Combiner state; nothing is done before operation legalization.
	/// \returns A zero constant or undef when the extracted element is known to
	///          be zero/undef (an out-of-range index yields undef), an extract
	///          from an equivalent generic vector_shuffle when the selected
	///          shuffle input is a suitable load, or an empty SDValue otherwise.
	static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
	                                         TargetLowering::DAGCombinerInfo &DCI) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue InVec = N->getOperand(0);
	  SDValue EltNo = N->getOperand(1);
	  EVT EltVT = N->getValueType(0);

	  if (!isa<ConstantSDNode>(EltNo))
	    return SDValue();

	  EVT OriginalVT = InVec.getValueType();

	  // Peek through bitcasts, don't duplicate a load with other uses.
	  InVec = peekThroughOneUseBitcasts(InVec);

	  // The bitcasts must not have changed the element count, otherwise the
	  // extract index would no longer address the same element.
	  EVT CurrentVT = InVec.getValueType();
	  if (!CurrentVT.isVector() ||
	      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
	    return SDValue();

	  if (!isTargetShuffle(InVec.getOpcode()))
	    return SDValue();

	  // Don't duplicate a load with other uses.
	  if (!InVec.hasOneUse())
	    return SDValue();

	  SmallVector<int, 16> ShuffleMask;
	  SmallVector<SDValue, 2> ShuffleOps;
	  bool UnaryShuffle;
	  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
	                            ShuffleOps, ShuffleMask, UnaryShuffle))
	    return SDValue();

	  // Select the input vector, guarding against out of range extract vector.
	  // The index is kept at its full unsigned width and compared with '>=' so
	  // that neither an index equal to the element count nor a huge index can
	  // read outside ShuffleMask.
	  unsigned NumElems = CurrentVT.getVectorNumElements();
	  const auto EltIdx = cast<ConstantSDNode>(EltNo)->getZExtValue();
	  int Idx = (EltIdx >= NumElems) ? SM_SentinelUndef : ShuffleMask[EltIdx];

	  if (Idx == SM_SentinelZero)
	    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
	                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
	  if (Idx == SM_SentinelUndef)
	    return DAG.getUNDEF(EltVT);

	  // Indices below NumElems refer to the first shuffle input, the others to
	  // the second one.
	  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
	  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
	                                         : ShuffleOps[1];

	  // If inputs to shuffle are the same for both ops, then allow 2 uses
	  unsigned AllowedUses =
	      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

	  if (LdNode.getOpcode() == ISD::BITCAST) {
	    // Don't duplicate a load with other uses.
	    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
	      return SDValue();

	    AllowedUses = 1; // only allow 1 load use if we have a bitcast
	    LdNode = LdNode.getOperand(0);
	  }

	  if (!ISD::isNormalLoad(LdNode.getNode()))
	    return SDValue();

	  // cast<> cannot yield null here, so no separate null check is needed.
	  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

	  if (!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
	    return SDValue();

	  // If there's a bitcast before the shuffle, check if the load type and
	  // alignment is valid.
	  unsigned Align = LN0->getAlignment();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	      EltVT.getTypeForEVT(*DAG.getContext()));

	  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
	    return SDValue();

	  // All checks match so transform back to vector_shuffle so that DAG combiner
	  // can finish the job
	  SDLoc dl(N);

	  // Create shuffle node taking into account the case that its a unary shuffle
	  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
	  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
	                                 ShuffleMask);
	  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
	                     EltNo);
	}

	// Try to match patterns such as
	//   (i16 bitcast (v16i1 x))
	//   ->
	//   (i16 movmsk (v16i8 sext (v16i1 x)))
	// before the illegal vector is scalarized on subtargets that don't have legal
	// vxi1 types.
	//
	// Two independent rewrites are attempted:
	//  1. With AVX512, a v16i1/v32i1/v64i1 bitcast of a scalar integer OR whose
	//     operands are one SHL and one ZERO_EXTEND/AND is turned into a
	//     CONCAT_VECTORS of two half-width mask bitcasts.
	//  2. Without AVX512 but with SSE2, a scalar-integer bitcast of a v2i1..v32i1
	//     vector is turned into a sign-extension followed by MOVMSK.
	// Returns an empty SDValue when neither rewrite applies.
	static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
	                                  const X86Subtarget &Subtarget) {
	  EVT VT = BitCast.getValueType();
	  SDValue N0 = BitCast.getOperand(0);
	  EVT VecVT = N0->getValueType(0);

	  // VT must be a simple type before getSimpleVT() may be called on it; an
	  // extended vector type (e.g. v3i1) falls through and is rejected below.
	  if (VT.isVector() && VT.isSimple() && VecVT.isScalarInteger() &&
	      Subtarget.hasAVX512() && N0->getOpcode() == ISD::OR) {
	    SDValue Op0 = N0->getOperand(0);
	    SDValue Op1 = N0->getOperand(1);
	    MVT TrunckVT;
	    MVT BitcastVT;
	    // Pick the scalar and mask types that cover one half of the result.
	    switch (VT.getSimpleVT().SimpleTy) {
	    default:
	      return SDValue();
	    case MVT::v16i1:
	      TrunckVT = MVT::i8;
	      BitcastVT = MVT::v8i1;
	      break;
	    case MVT::v32i1:
	      TrunckVT = MVT::i16;
	      BitcastVT = MVT::v16i1;
	      break;
	    case MVT::v64i1:
	      TrunckVT = MVT::i32;
	      BitcastVT = MVT::v32i1;
	      break;
	    }
	    bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
	    bool isArg0UndefLeft =
	        Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
	    bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
	    bool isArg1UndefLeft =
	        Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
	    // OpLeft is the shifted operand, OpRight the zero-extended/masked one.
	    SDValue OpLeft;
	    SDValue OpRight;
	    if (isArg0UndefRight && isArg1UndefLeft) {
	      OpLeft = Op0;
	      OpRight = Op1;
	    } else if (isArg1UndefRight && isArg0UndefLeft) {
	      OpLeft = Op1;
	      OpRight = Op0;
	    } else
	      return SDValue();
	    // NOTE(review): neither the SHL amount nor the AND mask is inspected here,
	    // and the shifted half is placed first in the concat - confirm this is the
	    // intended pairing for every producer of this pattern.
	    SDLoc DL(BitCast);
	    SDValue Shr = OpLeft->getOperand(0);
	    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
	    SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
	    SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
	    SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
	  }

	  if (!VT.isScalarInteger() || !VecVT.isSimple())
	    return SDValue();

	  // With AVX512 vxi1 types are legal and we prefer using k-regs.
	  // MOVMSK is supported in SSE2 or later.
	  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
	    return SDValue();

	  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
	  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
	  // v8i16 and v16i16.
	  // For these two cases, we can shuffle the upper element bytes to a
	  // consecutive sequence at the start of the vector and treat the results as
	  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
	  // for v16i16 this is not the case, because the shuffle is expensive, so we
	  // avoid sign-extending to this type entirely.
	  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
	  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
	  MVT SExtVT;
	  // Stays invalid when the integer MOVMSK form is used directly.
	  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	  switch (VecVT.getSimpleVT().SimpleTy) {
	  default:
	    return SDValue();
	  case MVT::v2i1:
	    SExtVT = MVT::v2i64;
	    FPCastVT = MVT::v2f64;
	    break;
	  case MVT::v4i1:
	    SExtVT = MVT::v4i32;
	    FPCastVT = MVT::v4f32;
	    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
	    // sign-extend to a 256-bit operation to avoid truncation.
	    if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
	        N0->getOperand(0).getValueType().is256BitVector()) {
	      SExtVT = MVT::v4i64;
	      FPCastVT = MVT::v4f64;
	    }
	    break;
	  case MVT::v8i1:
	    SExtVT = MVT::v8i16;
	    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
	    // sign-extend to a 256-bit operation to match the compare.
	    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
	    // 256-bit because the shuffle is cheaper than sign extending the result of
	    // the compare.
	    if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
	        (N0->getOperand(0).getValueType().is256BitVector() ||
	         N0->getOperand(0).getValueType().is512BitVector())) {
	      SExtVT = MVT::v8i32;
	      FPCastVT = MVT::v8f32;
	    }
	    break;
	  case MVT::v16i1:
	    SExtVT = MVT::v16i8;
	    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
	    // it is not profitable to sign-extend to 256-bit because this will
	    // require an extra cross-lane shuffle which is more expensive than
	    // truncating the result of the compare to 128-bits.
	    break;
	  case MVT::v32i1:
	    SExtVT = MVT::v32i8;
	    break;
	  }

	  SDLoc DL(BitCast);
	  SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);

	  if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
	    // Handle pre-AVX2 cases by splitting to two v16i1's.
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
	    SDValue Lo = extract128BitVector(V, 0, DAG, DL);
	    SDValue Hi = extract128BitVector(V, 16, DAG, DL);
	    Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
	    Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
	    // Merge the two 16-bit masks: Hi occupies bits 16..31.
	    Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
	                     DAG.getConstant(16, DL, ShiftTy));
	    V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
	    return DAG.getZExtOrTrunc(V, DL, VT);
	  }

	  if (SExtVT == MVT::v8i16) {
	    // No i16 MOVMSK exists, so pack the words down to bytes first.
	    assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
	    V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
	                    DAG.getUNDEF(MVT::v8i16));
	  } else
	    assert(SExtVT.getScalarType() != MVT::i16 &&
	           "Vectors of i16 must be packed");
	  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
	    V = DAG.getBitcast(FPCastVT, V);
	  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
	  return DAG.getZExtOrTrunc(V, DL, VT);
	}

	/// Target combine for ISD::BITCAST nodes.
	///
	/// Tried in order:
	///  - before type legalization: the vXi1 <-> scalar rewrites done by
	///    combineBitcastvxi1, then (with VLX) widening v2i1/v4i1 <-> scalar
	///    integer bitcasts through v8i1/i8 to avoid a trip through memory;
	///  - bitcasts producing x86mmx from a v2i32 BUILD_VECTOR with a zero second
	///    element, from an index-0 extract of a 128-bit vector, or from a v2i32
	///    FP_TO_SINT;
	///  - an f32 (SSE1) / f64 (SSE2) bitcast of a single-use integer AND/OR/XOR
	///    that has one bitcasted FP operand, rewritten as X86ISD::FAND/FOR/FXOR.
	/// Returns an empty SDValue when nothing matched.
	static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI,
	                              const X86Subtarget &Subtarget) {
	  const SDValue Src = N->getOperand(0);
	  const EVT DstVT = N->getValueType(0);
	  const EVT SrcVT = Src.getValueType();

	  if (DCI.isBeforeLegalize()) {
	    // (i16 bitcast (v16i1 x)) -> (i16 movmsk (v16i8 sext (v16i1 x))), done
	    // before the setcc result is scalarized on subtargets without legal vxi1
	    // types.
	    if (SDValue MovMsk = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
	      return MovMsk;

	    auto IsNarrowMask = [](EVT Ty) {
	      return Ty == MVT::v4i1 || Ty == MVT::v2i1;
	    };

	    // Illegal scalar integer -> v4i1/v2i1: go through i8/v8i1 and take the
	    // low subvector instead of bouncing through memory.
	    if (IsNarrowMask(DstVT) && SrcVT.isScalarInteger() &&
	        Subtarget.hasVLX()) {
	      SDLoc dl(N);
	      SDValue Wide = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, Src);
	      Wide = DAG.getBitcast(MVT::v8i1, Wide);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Wide,
	                         DAG.getIntPtrConstant(0, dl));
	    }

	    // v4i1/v2i1 -> illegal scalar integer: pad with undef up to v8i1, view it
	    // as i8 and truncate.
	    if (IsNarrowMask(SrcVT) && DstVT.isScalarInteger() &&
	        Subtarget.hasVLX()) {
	      SDLoc dl(N);
	      const unsigned NumPieces = 8 / SrcVT.getVectorNumElements();
	      SmallVector<SDValue, 4> Pieces(NumPieces, DAG.getUNDEF(SrcVT));
	      Pieces[0] = Src;
	      SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Pieces);
	      Wide = DAG.getBitcast(MVT::i8, Wide);
	      return DAG.getNode(ISD::TRUNCATE, dl, DstVT, Wide);
	    }
	  }

	  // Since MMX types are special and don't usually play with other vector
	  // types, handle them early to emit efficient code without store-load
	  // conversions.
	  if (DstVT == MVT::x86mmx) {
	    const unsigned SrcOpc = Src.getOpcode();

	    // i32 placed in the low word of an x86mmx value.
	    if (SrcOpc == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32 &&
	        isNullConstant(Src.getOperand(1))) {
	      SDValue LowElt = Src->getOperand(0);
	      if (LowElt.getValueType() == MVT::i32)
	        return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(LowElt), DstVT, LowElt);
	    }

	    // Element or subvector extraction at index 0 of a 128-bit vector.
	    const bool IsExtract = SrcOpc == ISD::EXTRACT_VECTOR_ELT ||
	                           SrcOpc == ISD::EXTRACT_SUBVECTOR;
	    if (IsExtract && isNullConstant(Src.getOperand(1))) {
	      SDValue Vec = Src->getOperand(0);
	      if (Vec.getValueType().is128BitVector())
	        return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(Vec), DstVT,
	                           DAG.getBitcast(MVT::v2i64, Vec));
	    }

	    // v2i32 FP_TO_SINT result: widen to v4i32 with undef, then move.
	    if (SrcVT == MVT::v2i32 && SrcOpc == ISD::FP_TO_SINT) {
	      SDLoc DL(Src);
	      SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
	                                    DAG.getUNDEF(MVT::v2i32));
	      return DAG.getNode(X86ISD::MOVDQ2Q, DL, DstVT,
	                         DAG.getBitcast(MVT::v2i64, Widened));
	    }
	  }

	  // Convert a bitcasted integer logic operation that has one bitcasted
	  // floating-point operand into a floating-point logic operation. This may
	  // create a load of a constant, but that is cheaper than materializing the
	  // constant in an integer register and transferring it to an SSE register or
	  // transferring the SSE operand to integer register and back.
	  unsigned FPOpcode;
	  switch (Src.getOpcode()) {
	  case ISD::AND:
	    FPOpcode = X86ISD::FAND;
	    break;
	  case ISD::OR:
	    FPOpcode = X86ISD::FOR;
	    break;
	  case ISD::XOR:
	    FPOpcode = X86ISD::FXOR;
	    break;
	  default:
	    return SDValue();
	  }

	  const bool F32Ok = Subtarget.hasSSE1() && DstVT == MVT::f32;
	  const bool F64Ok = Subtarget.hasSSE2() && DstVT == MVT::f64;
	  if (!F32Ok && !F64Ok)
	    return SDValue();

	  // Both folds below require the logic op to have no other user.
	  if (!Src.hasOneUse())
	    return SDValue();

	  const SDValue LogicOps[2] = {Src.getOperand(0), Src.getOperand(1)};
	  SDLoc DL0(Src);

	  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
	  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
	  // The first operand is tried before the second.
	  for (unsigned I = 0; I != 2; ++I) {
	    const SDValue &Casted = LogicOps[I];
	    const SDValue &Other = LogicOps[1 - I];
	    if (Casted.getOpcode() != ISD::BITCAST || !Casted.hasOneUse())
	      continue;
	    SDValue FPVal = Casted.getOperand(0);
	    if (!(FPVal.getValueType() == DstVT) || isa<ConstantSDNode>(FPVal))
	      continue;
	    SDValue CastedOther = DAG.getBitcast(DstVT, Other);
	    return DAG.getNode(FPOpcode, DL0, DstVT, FPVal, CastedOther);
	  }

	  return SDValue();
	}

	// Match a binop + shuffle pyramid that represents a horizontal reduction over
	// the elements of a vector.
	//
	// Extract must be an EXTRACT_VECTOR_ELT of element 0. On success the matched
	// opcode (one of CandidateBinOps) is stored in BinOp and the vector being
	// reduced is returned; otherwise SDValue() is returned and BinOp is left
	// untouched.
	static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
	                                   ArrayRef<ISD::NodeType> CandidateBinOps) {
	  // The pattern must end in an extract from index 0.
	  if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      !isNullConstant(Extract->getOperand(1)))
	    return SDValue();

	  SDValue Cur = Extract->getOperand(0);
	  const unsigned NumStages =
	      Log2_32(Cur.getValueType().getVectorNumElements());

	  // The top of the pyramid must be one of the candidate binary ops.
	  const unsigned RootOpc = Cur.getOpcode();
	  if (llvm::none_of(CandidateBinOps, [RootOpc](ISD::NodeType Candidate) {
	        return RootOpc == unsigned(Candidate);
	      }))
	    return SDValue();

	  // At each stage, we're looking for something that looks like:
	  //   %s = shufflevector <8 x i32> %op, <8 x i32> undef,
	  //        <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
	  //                   i32 undef, i32 undef, i32 undef, i32 undef>
	  //   %a = binop <8 x i32> %op, %s
	  // Where the mask changes according to the stage. E.g. for a 3-stage
	  // pyramid, walking from the extract towards the source, we expect:
	  //   <1,u,u,u,u,u,u,u>
	  //   <2,3,u,u,u,u,u,u>
	  //   <4,5,6,7,u,u,u,u>
	  for (unsigned Stage = 0; Stage != NumStages; ++Stage) {
	    if (Cur.getOpcode() != RootOpc)
	      return SDValue();

	    // The shuffle may sit on either side of the binop; prefer operand 0.
	    SDValue LHS = Cur.getOperand(0);
	    SDValue RHS = Cur.getOperand(1);
	    SDValue Other = RHS;
	    auto *Shuf = dyn_cast<ShuffleVectorSDNode>(LHS.getNode());
	    if (!Shuf) {
	      Shuf = dyn_cast<ShuffleVectorSDNode>(RHS.getNode());
	      Other = LHS;
	    }

	    // The shuffle's first input must be the binop's other operand.
	    if (!Shuf || Shuf->getOperand(0) != Other)
	      return SDValue();

	    // The low 2^Stage lanes must pull from the lanes directly above them.
	    const int HalfWidth = 1 << Stage;
	    for (int Lane = 0; Lane != HalfWidth; ++Lane)
	      if (Shuf->getMaskElt(Lane) != HalfWidth + Lane)
	        return SDValue();

	    Cur = Other;
	  }

	  BinOp = RootOpc;
	  return Cur;
	}

	// Given a select, detect the following pattern:
	// 1: %2 = zext <N x i8> %0 to <N x i32>
	// 2: %3 = zext <N x i8> %1 to <N x i32>
	// 3: %4 = sub nsw <N x i32> %2, %3
	// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
	// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
	// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
	// This is useful as it is the input into a SAD pattern.
	static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
	SDValue &Op1) {
	// Check the condition of the select instruction is greater-than.
	SDValue SetCC = Select->getOperand(0);
	if (SetCC.getOpcode() != ISD::SETCC)
	return false;
	ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
	if (CC != ISD::SETGT && CC != ISD::SETLT)
	return false;

	SDValue SelectOp1 = Select->getOperand(1);
	SDValue SelectOp2 = Select->getOperand(2);

	// The following instructions assume SelectOp1 is the subtraction operand
	// and SelectOp2 is the negation operand.
	// In the case of SETLT this is the other way around.
	if (CC == ISD::SETLT)
	std::swap(SelectOp1, SelectOp2);

	// The second operand of the select should be the negation of the first
	// operand, which is implemented as 0 - SelectOp1.
	if (!(SelectOp2.getOpcode() == ISD::SUB &&
	ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
	SelectOp2.getOperand(1) == SelectOp1))
	return false;

	// The first operand of SetCC is the first operand of the select, which is the
	// difference between the two input vectors.
	if (SetCC.getOperand(0) != SelectOp1)
	return false;

	// In SetLT case, The second operand of the comparison can be either 1 or 0.
	APInt SplatVal;
	if ((CC == ISD::SETLT) &&
	!((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
	SplatVal.isOneValue()) \|\|
	(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
	return false;

	// In SetGT case, The second operand of the comparison can be either -1 or 0.
	if ((CC == ISD::SETGT) &&
	!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) \|\|
	ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
	return false;

	// The first operand of the select is the difference between the two input
	// vectors.
	if (SelectOp1.getOpcode() != ISD::SUB)
	return false;

	Op0 = SelectOp1.getOperand(0);
	Op1 = SelectOp1.getOperand(1);

	// Check if the operands of the sub are zero-extended from vectors of i8.
	if (Op0.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 \|\|
	Op1.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
	return false;

	return true;
	}

	// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
	// to these zexts.
	//
	// The i8 inputs are padded with zero vectors up to at least 128 bits before
	// being handed to X86ISD::PSADBW; the result is a vector of i64 spanning the
	// same number of bits.
	static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
	                            const SDValue &Zext1, const SDLoc &DL) {
	  // Find the appropriate width for the PSADBW.
	  const EVT NarrowVT = Zext0.getOperand(0).getValueType();
	  const unsigned NarrowBits = NarrowVT.getSizeInBits();
	  const unsigned RegSize = std::max(128u, NarrowBits);

	  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
	  // fill in the missing vector elements with 0. Slot 0 of Pieces is
	  // overwritten with the real input for each operand in turn.
	  SmallVector<SDValue, 16> Pieces(RegSize / NarrowBits,
	                                  DAG.getConstant(0, DL, NarrowVT));
	  const MVT ByteVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
	  auto PadWithZeros = [&](const SDValue &Zext) {
	    Pieces[0] = Zext.getOperand(0);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, ByteVT, Pieces);
	  };
	  SDValue SadLHS = PadWithZeros(Zext0);
	  SDValue SadRHS = PadWithZeros(Zext1);

	  // Actually build the SAD.
	  const MVT ResultVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
	  return DAG.getNode(X86ISD::PSADBW, DL, ResultVT, SadLHS, SadRHS);
	}

	// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
	// PHMINPOSUW.
	//
	// Extract is the candidate root of the reduction. Returns the replacement
	// scalar, or SDValue() if the pattern does not match or SSE4.1 is missing.
	static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
	                                             const X86Subtarget &Subtarget) {
	  // Bail without SSE41.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  const EVT ExtractVT = Extract->getValueType(0);
	  const bool IsByteReduction = ExtractVT == MVT::i8;
	  if (!IsByteReduction && !(ExtractVT == MVT::i16))
	    return SDValue();

	  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
	  unsigned BinOp = 0;
	  SDValue Src = matchBinOpReduction(
	      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
	  if (!Src)
	    return SDValue();

	  EVT SrcVT = Src.getValueType();
	  const EVT SrcSVT = SrcVT.getScalarType();
	  if (!(SrcSVT == ExtractVT) || (SrcVT.getSizeInBits() % 128) != 0)
	    return SDValue();

	  SDLoc DL(Extract);
	  SDValue MinPos = Src;

	  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
	  while (SrcVT.getSizeInBits() > 128) {
	    const unsigned HalfElts = SrcVT.getVectorNumElements() / 2;
	    SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, HalfElts);
	    const unsigned HalfBits = SrcVT.getSizeInBits();
	    SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, HalfBits);
	    SDValue Hi = extractSubVector(MinPos, HalfElts, DAG, DL, HalfBits);
	    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
	  }
	  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
	          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
	         "Unexpected value type");

	  // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
	  // to flip the value accordingly. Mask stays empty for UMIN.
	  SDValue Mask;
	  const unsigned EltBits = ExtractVT.getSizeInBits();
	  switch (BinOp) {
	  case ISD::SMAX:
	    Mask = DAG.getConstant(APInt::getSignedMaxValue(EltBits), DL, SrcVT);
	    break;
	  case ISD::SMIN:
	    Mask = DAG.getConstant(APInt::getSignedMinValue(EltBits), DL, SrcVT);
	    break;
	  case ISD::UMAX:
	    Mask = DAG.getConstant(APInt::getAllOnesValue(EltBits), DL, SrcVT);
	    break;
	  default:
	    break;
	  }

	  // The same XOR is applied before and after the PHMINPOS.
	  auto FlipWithMask = [&](SDValue V) {
	    return Mask ? DAG.getNode(ISD::XOR, DL, SrcVT, Mask, V) : V;
	  };
	  MinPos = FlipWithMask(MinPos);

	  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
	  // shuffling each upper element down and insert zeros. This means that the
	  // v16i8 UMIN will leave the upper element as zero, performing zero-extension
	  // ready for the PHMINPOS.
	  if (IsByteReduction) {
	    const int OddBytesThenZero[] = {1, 16, 3,  16, 5,  16, 7,  16,
	                                    9, 16, 11, 16, 13, 16, 15, 16};
	    SDValue ZeroVec = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
	    SDValue Upper =
	        DAG.getVectorShuffle(SrcVT, DL, MinPos, ZeroVec, OddBytesThenZero);
	    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
	  }

	  // Perform the PHMINPOS on a v8i16 vector.
	  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
	  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
	  MinPos = DAG.getBitcast(SrcVT, MinPos);

	  MinPos = FlipWithMask(MinPos);

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
	                     DAG.getIntPtrConstant(0, DL));
	}

	// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
	//
	// An OR reduction (any_of) becomes "MOVMSK != 0"; an AND reduction (all_of)
	// becomes "MOVMSK == all-lanes-set". The result is all-ones or zero in the
	// extracted type. Returns SDValue() when the pattern or subtarget does not
	// fit.
	static SDValue combineHorizontalPredicateResult(SDNode *Extract,
	                                                SelectionDAG &DAG,
	                                                const X86Subtarget &Subtarget) {
	  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
	  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
	    return SDValue();

	  const EVT ExtractVT = Extract->getValueType(0);
	  const unsigned BitWidth = ExtractVT.getSizeInBits();
	  const bool IsSupportedScalar = ExtractVT == MVT::i64 ||
	                                 ExtractVT == MVT::i32 ||
	                                 ExtractVT == MVT::i16 || ExtractVT == MVT::i8;
	  if (!IsSupportedScalar)
	    return SDValue();

	  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
	  unsigned BinOp = 0;
	  SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
	  if (!Match)
	    return SDValue();

	  // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
	  // which we can't support here for now.
	  if (Match.getScalarValueSizeInBits() != BitWidth)
	    return SDValue();

	  // 128-bit vectors are always fine. 256-bit vectors need AVX for 32/64-bit
	  // elements and AVX2 (PMOVMSKB on v32i8) for v16i16/v32i8.
	  const unsigned MatchSizeInBits = Match.getValueSizeInBits();
	  const bool Is128 = MatchSizeInBits == 128;
	  const bool IsUsable256 =
	      MatchSizeInBits == 256 &&
	      ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2());
	  if (!Is128 && !IsUsable256)
	    return SDValue();

	  // Don't bother performing this for 2-element vectors.
	  if (Match.getValueType().getVectorNumElements() <= 2)
	    return SDValue();

	  // Check that we are extracting a reduction of all sign bits.
	  if (DAG.ComputeNumSignBits(Match) != BitWidth)
	    return SDValue();

	  // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
	  const bool UseFPMovMsk = BitWidth == 64 || BitWidth == 32;
	  const MVT MaskVT =
	      UseFPMovMsk ? MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
	                                     MatchSizeInBits / BitWidth)
	                  : MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

	  // any_of -> MOVMSK != 0
	  // all_of -> MOVMSK == ((1 << NumElts) - 1)
	  const bool IsAnyOf = BinOp == ISD::OR;
	  const APInt CompareBits =
	      IsAnyOf ? APInt::getNullValue(32)
	              : APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
	  const ISD::CondCode CondCode =
	      IsAnyOf ? ISD::CondCode::SETNE : ISD::CondCode::SETEQ;

	  // Perform the select as i32/i64 and then truncate to avoid partial register
	  // stalls.
	  const unsigned ResWidth = std::max(BitWidth, 32u);
	  const EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
	  SDLoc DL(Extract);
	  SDValue Zero = DAG.getConstant(0, DL, ResVT);
	  SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
	  SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
	                               DAG.getBitcast(MaskVT, Match));
	  SDValue Expected = DAG.getConstant(CompareBits, DL, MVT::i32);
	  SDValue Res = DAG.getSelectCC(DL, MovMsk, Expected, Ones, Zero, CondCode);
	  return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
	}

	// Attempt to replace an add-reduction of an absolute-difference select (as
	// matched by detectZextAbsDiff) with a PSADBW-based sequence.
	//
	// Extract is the candidate root: matchBinOpReduction requires it to be an
	// EXTRACT_VECTOR_ELT of element 0 sitting on top of an ADD + shuffle pyramid.
	// Returns the replacement scalar, or SDValue() when the pattern does not
	// match, SSE2 is unavailable, or the vector has too many elements for the
	// widest register considered here.
	static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// PSADBW is only supported on SSE2 and up.
	if (!Subtarget.hasSSE2())
	return SDValue();

	// Verify the type we're extracting from is any integer type above i16.
	EVT VT = Extract->getOperand(0).getValueType();
	if (!VT.isSimple() \|\| !(VT.getVectorElementType().getSizeInBits() > 16))
	return SDValue();

	// Widest register size (in bits) considered for the i8 inputs.
	unsigned RegSize = 128;
	if (Subtarget.hasBWI())
	RegSize = 512;
	else if (Subtarget.hasAVX2())
	RegSize = 256;

	// We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
	// TODO: We should be able to handle larger vectors by splitting them before
	// feeding them into several SADs, and then reducing over those.
	if (RegSize / VT.getVectorNumElements() < 8)
	return SDValue();

	// Match shuffle + add pyramid.
	unsigned BinOp = 0;
	SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});

	// The operand is expected to be zero extended from i8
	// (verified in detectZextAbsDiff).
	// In order to convert to i64 and above, additional any/zero/sign
	// extend is expected.
	// The zero extend from 32 bit has no mathematical effect on the result.
	// Also the sign extend is basically zero extend
	// (extends the sign bit which is zero).
	// So it is correct to skip the sign/zero extend instruction.
	if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND \|\|
	Root.getOpcode() == ISD::ZERO_EXTEND \|\|
	Root.getOpcode() == ISD::ANY_EXTEND))
	Root = Root.getOperand(0);

	// If there was a match, we want Root to be a select that is the root of an
	// abs-diff pattern.
	if (!Root \|\| (Root.getOpcode() != ISD::VSELECT))
	return SDValue();

	// Check whether we have an abs-diff pattern feeding into the select.
	SDValue Zext0, Zext1;
	if (!detectZextAbsDiff(Root, Zext0, Zext1))
	return SDValue();

	// Create the SAD instruction.
	SDLoc DL(Extract);
	SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

	// If the original vector was wider than 8 elements, sum over the results
	// in the SAD vector.
	unsigned Stages = Log2_32(VT.getVectorNumElements());
	MVT SadVT = SAD.getSimpleValueType();
	if (Stages > 3) {
	unsigned SadElems = SadVT.getVectorNumElements();

	// Rebuild a shuffle + add pyramid over the SAD elements, widest stage
	// first: stage i moves elements [2^(i-1), 2^i) down onto [0, 2^(i-1)) and
	// adds them in; all other lanes of the shuffle are undef (-1).
	for(unsigned i = Stages - 3; i > 0; --i) {
	SmallVector<int, 16> Mask(SadElems, -1);
	for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
	Mask[j] = MaskEnd + j;

	SDValue Shuffle =
	DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
	SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
	}
	}

	// Reinterpret the SAD vector with elements of the extracted scalar type so
	// the final extract yields that type directly.
	MVT Type = Extract->getSimpleValueType(0);
	unsigned TypeSizeInBits = Type.getSizeInBits();
	// Return the lowest TypeSizeInBits bits.
	MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
	SAD = DAG.getBitcast(ResVT, SAD);
	// The original index operand is reused; matchBinOpReduction only succeeds
	// when it is the constant 0.
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
	Extract->getOperand(1));
	}

	// Attempt to peek through a target shuffle and extract the scalar from the
	// source.
	//
	// N is the extract node: operand 0 is the source vector, operand 1 the
	// element index. Only runs after operation legalization has started, and only
	// for constant indices on non-i1 vectors. Returns the simplified value, or
	// SDValue() if nothing could be done.
	static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
	                                         TargetLowering::DAGCombinerInfo &DCI,
	                                         const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  SDValue Idx = N->getOperand(1);

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcSVT = SrcVT.getVectorElementType();
	  unsigned NumSrcElts = SrcVT.getVectorNumElements();

	  // Don't attempt this for boolean mask vectors or unknown extraction indices.
	  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
	    return SDValue();

	  // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
	  if (X86ISD::VBROADCAST == Src.getOpcode() &&
	      Src.getOperand(0).getValueType() == VT)
	    return Src.getOperand(0);

	  // Resolve the target shuffle inputs and mask.
	  SmallVector<int, 16> Mask;
	  SmallVector<SDValue, 2> Ops;
	  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
	    return SDValue();

	  // Attempt to narrow/widen the shuffle mask to the correct size.
	  if (Mask.size() != NumSrcElts) {
	    if ((NumSrcElts % Mask.size()) == 0) {
	      // Mask is coarser than the source type: split every mask element.
	      SmallVector<int, 16> ScaledMask;
	      int Scale = NumSrcElts / Mask.size();
	      scaleShuffleMask<int>(Scale, Mask, ScaledMask);
	      Mask = std::move(ScaledMask);
	    } else if ((Mask.size() % NumSrcElts) == 0) {
	      // Mask is finer than the source type: merge neighbouring mask elements
	      // for as long as that is possible.
	      SmallVector<int, 16> WidenedMask;
	      while (Mask.size() > NumSrcElts &&
	             canWidenShuffleElements(Mask, WidenedMask))
	        Mask = std::move(WidenedMask);
	      // TODO - investigate support for wider shuffle masks with known upper
	      // undef/zero elements for implicit zero-extension.
	    }
	  }

	  // Check if narrowing/widening failed.
	  if (Mask.size() != NumSrcElts)
	    return SDValue();

	  // An out-of-range constant index must not be used to index Mask.
	  uint64_t ExtractIdx = N->getConstantOperandVal(1);
	  if (ExtractIdx >= NumSrcElts)
	    return SDValue();

	  int SrcIdx = Mask[ExtractIdx];
	  SDLoc dl(N);

	  // If the shuffle source element is undef/zero then we can just accept it.
	  if (SrcIdx == SM_SentinelUndef)
	    return DAG.getUNDEF(VT);

	  if (SrcIdx == SM_SentinelZero)
	    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
	                                : DAG.getConstant(0, dl, VT);

	  // Split the mask entry into "which shuffle input" and "which element".
	  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
	  SrcOp = DAG.getBitcast(SrcVT, SrcOp);
	  SrcIdx = SrcIdx % Mask.size();

	  // We can only extract other elements from 128-bit vectors and in certain
	  // circumstances, depending on SSE-level.
	  // TODO: Investigate using extract_subvector for larger vectors.
	  // TODO: Investigate float/double extraction if it will be just stored.
	  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
	      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
	    assert(SrcSVT == VT && "Unexpected extraction type");
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
	                       DAG.getIntPtrConstant(SrcIdx, dl));
	  }

	  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
	      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
	           "Unexpected extraction type");
	    // PEXTRW/PEXTRB produce an i32; adjust to the requested type afterwards.
	    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
	    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
	                                DAG.getIntPtrConstant(SrcIdx, dl));
	    return DAG.getZExtOrTrunc(ExtOp, dl, VT);
	  }

	  return SDValue();
	}

	/// Detect vector gather/scatter index generation and convert it from being a
	/// bunch of shuffles and extracts into a somewhat faster sequence.
	/// For i686, the best sequence is apparently storing the value and loading
	/// scalars back, while for x64 we should use 64-bit extracts and shifts.
	///
	/// Before reaching that transform, several other extract combines are tried
	/// in order: peeking through target shuffles, folding shuffle+extract into a
	/// load, MMX-specific bitcast forms, constant i1 extraction, the SAD pattern,
	/// all_of/any_of reductions via MOVMSK and min/max reductions via PHMINPOSUW.
	/// Returns the replacement value, SDValue(N, 0) when uses were rewritten in
	/// place, or SDValue() when nothing applied.
	static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const X86Subtarget &Subtarget) {
	  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
	    return NewOp;

	  // TODO - Remove this once we can handle the implicit zero-extension of
	  // X86ISD::PEXTRW/X86ISD::PEXTRB in:
	  // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
	  // combineBasicSADPattern.
	  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
	    return NewOp;

	  SDValue InputVector = N->getOperand(0);
	  SDValue EltIdx = N->getOperand(1);

	  EVT SrcVT = InputVector.getValueType();
	  EVT VT = N->getValueType(0);
	  SDLoc dl(InputVector);

	  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getBitcast(VT, InputVector);
	  }

	  // Detect mmx to i32 conversion through a v2i32 elt extract.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
	  }

	  // Extracting a constant-index i1 from a bitcasted integer constant is just
	  // that bit of the constant.
	  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
	      isa<ConstantSDNode>(EltIdx) &&
	      isa<ConstantSDNode>(InputVector.getOperand(0))) {
	    uint64_t ExtractedElt = N->getConstantOperandVal(1);
	    uint64_t InputValue = InputVector.getConstantOperandVal(0);
	    uint64_t Res = (InputValue >> ExtractedElt) & 1;
	    return DAG.getConstant(Res, dl, MVT::i1);
	  }

	  // Check whether this extract is the root of a sum of absolute differences
	  // pattern. This has to be done here because we really want it to happen
	  // pre-legalization.
	  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
	    return SAD;

	  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
	  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
	    return Cmp;

	  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
	  if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
	    return MinMax;

	  // Only operate on vectors of 4 elements, where the alternative shuffling
	  // gets to be more expensive.
	  if (SrcVT != MVT::v4i32)
	    return SDValue();

	  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
	  // single use which is a sign-extend or zero-extend, and all elements are
	  // used.
	  SmallVector<SDNode *, 4> Uses;
	  unsigned ExtractedElements = 0;
	  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
	                            UE = InputVector.getNode()->use_end();
	       UI != UE; ++UI) {
	    if (UI.getUse().getResNo() != InputVector.getResNo())
	      return SDValue();

	    SDNode *Extract = *UI;
	    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    if (Extract->getValueType(0) != MVT::i32)
	      return SDValue();
	    if (!Extract->hasOneUse())
	      return SDValue();
	    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
	        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
	      return SDValue();
	    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
	      return SDValue();

	    // The source is v4i32, so only lanes 0..3 exist. Reject anything else
	    // before it is used as a shift amount here or as an index into Vals
	    // below.
	    uint64_t EltNo = Extract->getConstantOperandVal(1);
	    if (EltNo >= 4)
	      return SDValue();

	    // Record which element was extracted.
	    ExtractedElements |= 1u << EltNo;
	    Uses.push_back(Extract);
	  }

	  // If not all the elements were used, this may not be worthwhile.
	  if (ExtractedElements != 15)
	    return SDValue();

	  // Ok, we've now decided to do the transformation.
	  // If 64-bit shifts are legal, use the extract-shift sequence,
	  // otherwise bounce the vector off the cache.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Vals[4];

	  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
	    // View the vector as two i64 halves; each half holds two i32 lanes, the
	    // upper one recovered with a shift by 32.
	    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
	    auto &DL = DAG.getDataLayout();
	    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
	    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
	                                     DAG.getConstant(0, dl, VecIdxTy));
	    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
	                                  DAG.getConstant(1, dl, VecIdxTy));

	    SDValue ShAmt = DAG.getConstant(
	        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
	    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
	    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
	                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
	    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
	    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
	                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
	  } else {
	    // Store the value to a temporary stack slot.
	    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
	    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
	                              MachinePointerInfo());

	    EVT ElementType = SrcVT.getVectorElementType();
	    unsigned EltSize = ElementType.getSizeInBits() / 8;

	    // Replace each use (extract) with a load of the appropriate element.
	    for (unsigned i = 0; i < 4; ++i) {
	      uint64_t Offset = EltSize * i;
	      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

	      SDValue ScalarAddr =
	          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

	      // Load the scalar.
	      Vals[i] =
	          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
	    }
	  }

	  // Replace the extracts. Every recorded index was checked to be below 4.
	  for (SDNode *Extract : Uses) {
	    uint64_t IdxVal = Extract->getConstantOperandVal(1);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
	  }

	  // The replacement was made in place; return N so it won't be revisited.
	  return SDValue(N, 0);
	}

	/// If a vector select has an operand that is -1 or 0, try to simplify the
	/// select to a bitwise logic operation.
	///
	/// \param N         The node to simplify; anything but ISD::VSELECT is rejected.
	/// \param DAG       The DAG in which replacement nodes are created.
	/// \param DCI       Queried to find out whether type legalization has run.
	/// \param Subtarget Queried for AVX512 (vXi1 mask conditions).
	/// \returns The replacement value, or an empty SDValue if nothing matched.
	///
	/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
	static SDValue
	combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  EVT VT = LHS.getValueType();
	  EVT CondVT = Cond.getValueType();
	  SDLoc DL(N);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (N->getOpcode() != ISD::VSELECT)
	    return SDValue();

	  assert(CondVT.isVector() && "Vector select expects a vector selector!");

	  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
	  // Check if the first operand is all zeros and Cond type is vXi1.
	  // This situation only applies to avx512.
	  if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
	      CondVT.getVectorElementType() == MVT::i1) {
	    // Invert the cond to not(cond) : xor(op,allones)=not(op)
	    SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
	                                  DAG.getAllOnesConstant(DL, CondVT));
	    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
	    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
	  }

	  // To use the condition operand as a bitwise mask, it must have elements that
	  // are the same size as the select elements. Ie, the condition operand must
	  // have already been promoted from the IR select condition type <N x i1>.
	  // Don't check if the types themselves are equal because that excludes
	  // vector floating-point selects.
	  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
	    return SDValue();

	  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
	  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

	  // Try to invert the condition if true value is not all 1s and false value is
	  // not all 0s.
	  if (!TValIsAllOnes && !FValIsAllZeros &&
	      // Check if the selector will be produced by CMPP*/PCMP*.
	      Cond.getOpcode() == ISD::SETCC &&
	      // Check if SETCC has already been promoted.
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
	          CondVT) {
	    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

	    if (TValIsAllZeros || FValIsAllOnes) {
	      SDValue CC = Cond.getOperand(2);
	      ISD::CondCode NewCC =
	          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
	                               Cond.getOperand(0).getValueType().isInteger());
	      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
	                          NewCC);
	      std::swap(LHS, RHS);
	      // TValIsAllZeros is intentionally left stale here: after the swap at
	      // least one of the two flags below is true, so every path that is still
	      // alive returns before the ANDN case at the bottom reads it.
	      TValIsAllOnes = FValIsAllOnes;
	      FValIsAllZeros = TValIsAllZeros;
	    }
	  }

	  // Cond value must be 'sign splat' to be converted to a logical op.
	  if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
	    return SDValue();

	  // vselect Cond, 111..., 000... -> Cond
	  if (TValIsAllOnes && FValIsAllZeros)
	    return DAG.getBitcast(VT, Cond);

	  // The remaining folds build logic ops in CondVT, so once type legalization
	  // has run that type has to be legal.
	  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
	    return SDValue();

	  // vselect Cond, 111..., X -> or Cond, X
	  if (TValIsAllOnes) {
	    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
	    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
	    return DAG.getBitcast(VT, Or);
	  }

	  // vselect Cond, X, 000... -> and Cond, X
	  if (FValIsAllZeros) {
	    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
	    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
	    return DAG.getBitcast(VT, And);
	  }

	  // vselect Cond, 000..., X -> andn Cond, X
	  if (TValIsAllZeros) {
	    // The ANDNP node is built on a vector of i64 lanes covering the same
	    // number of bits as the condition.
	    MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
	    SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
	    SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
	    SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
	    return DAG.getBitcast(VT, AndN);
	  }

	  return SDValue();
	}

	/// Fold a scalar select between two integer constants into arithmetic on the
	/// zero-extended condition:
	///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	///
	/// Only fires when the result type is legal, the condition is an i1, the
	/// signed difference of the constants does not overflow, and its magnitude
	/// is a power of two (or 3/5/9 for i32/i64).
	///
	/// \returns The replacement value, or an empty SDValue if the pattern does not
	///          apply.
	static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  SDLoc DL(N);

	  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
	  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
	  if (!TrueC || !FalseC)
	    return SDValue();

	  // Don't do this for crazy integer types.
	  EVT VT = N->getValueType(0);
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // We're going to use the condition bit in math or logic ops. We could allow
	  // this with a wider condition value (post-legalization it becomes an i8),
	  // but if nothing is creating selects that late, it doesn't matter.
	  if (Cond.getValueType() != MVT::i1)
	    return SDValue();

	  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
	  // 3, 5, or 9 with i32/i64, so those get transformed too.
	  // TODO: For constants that overflow or do not differ by power-of-2 or small
	  // multiplier, convert to 'and' + 'add'.
	  const APInt &TrueVal = TrueC->getAPIntValue();
	  const APInt &FalseVal = FalseC->getAPIntValue();
	  bool OV;
	  APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
	  if (OV)
	    return SDValue();

	  APInt AbsDiff = Diff.abs();
	  if (AbsDiff.isPowerOf2() ||
	      ((VT == MVT::i32 || VT == MVT::i64) &&
	       (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {

	    // We need a positive multiplier constant for shift/LEA codegen. The 'not'
	    // of the condition can usually be folded into a compare predicate, but even
	    // without that, the sequence should be cheaper than a CMOV alternative.
	    if (TrueVal.slt(FalseVal)) {
	      Cond = DAG.getNOT(DL, Cond, MVT::i1);
	      // Only the node pointers are swapped; from here on FalseC names the
	      // smaller constant, which is used as the additive base below.
	      std::swap(TrueC, FalseC);
	    }

	    // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	    SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

	    // Multiply condition by the difference if non-one.
	    if (!AbsDiff.isOneValue())
	      R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));

	    // Add the base if non-zero.
	    if (!FalseC->isNullValue())
	      R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));

	    return R;
	  }

	  return SDValue();
	}

	// Given a bitcast whose source operation could equally be performed on the
	// bitcast's result type, move the bitcast onto the operation's inputs. This
	// gives pattern matching more chances to form masked instructions. Callers
	// invoke this on values that feed a vselect.
	//
	// Returns true when OrigOp was replaced (through DCI.CombineTo), false when
	// nothing was changed.
	static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
	                                      TargetLowering::DAGCombinerInfo &DCI) {
	  // Nothing to push unless we start from a bitcast.
	  if (OrigOp.getOpcode() != ISD::BITCAST)
	    return false;

	  SDValue Src = OrigOp.getOperand(0);

	  // If something besides the bitcast uses the operation, rewriting it would
	  // duplicate the operation.
	  if (!Src.hasOneUse())
	    return false;

	  MVT ResVT = OrigOp.getSimpleValueType();
	  MVT ResEltVT = ResVT.getVectorElementType();
	  SDLoc DL(Src.getNode());

	  // Only these two operations are handled.
	  unsigned Opcode = Src.getOpcode();
	  if (Opcode != X86ISD::SHUF128 && Opcode != X86ISD::SUBV_BROADCAST)
	    return false;

	  // Both handled operations need 32- or 64-bit result elements.
	  unsigned EltBits = ResEltVT.getSizeInBits();
	  if (EltBits != 32 && EltBits != 64)
	    return false;

	  // Only change element size, not type (integer vs. floating point).
	  if (ResVT.isInteger() != Src.getSimpleValueType().isInteger())
	    return false;

	  SDValue Replacement;
	  if (Opcode == X86ISD::SHUF128) {
	    // Bitcast both vector inputs; the third operand is passed through as is.
	    SDValue In0 = DAG.getBitcast(ResVT, Src.getOperand(0));
	    DCI.AddToWorklist(In0.getNode());
	    SDValue In1 = DAG.getBitcast(ResVT, Src.getOperand(1));
	    DCI.AddToWorklist(In1.getNode());
	    Replacement = DAG.getNode(Opcode, DL, ResVT, In0, In1, Src.getOperand(2));
	  } else {
	    // SUBV_BROADCAST: the input keeps its total width but is re-expressed
	    // with the result's element type.
	    SDValue In = Src.getOperand(0);
	    MVT InVT = MVT::getVectorVT(
	        ResEltVT, In.getSimpleValueType().getSizeInBits() / EltBits);
	    In = DAG.getBitcast(InVT, In);
	    DCI.AddToWorklist(In.getNode());
	    Replacement = DAG.getNode(Opcode, DL, ResVT, In);
	  }

	  DCI.CombineTo(OrigOp.getNode(), Replacement);
	  return true;
	}

	/// Do target-specific dag combines on SELECT and VSELECT nodes.
	///
	/// The folds are tried in this order:
	///  1. floating-point select-of-compare -> X86ISD::FMIN / X86ISD::FMAX;
	///  2. 128/256-bit i8/i16 selects with a vXi1 condition on AVX512 targets
	///     lacking BWI+VLX -> select on a sign-extended condition;
	///  3. scalar select of two constants -> arithmetic on the condition;
	///  4. canonicalize (x < y ? x : y) / (x > y ? x : y) to <= / >=;
	///  5. VSELECT -> X86ISD::SUBUS (unsigned saturating subtract);
	///  6. VSELECT with all-ones / all-zeros arm -> bitwise logic;
	///  7. dynamic VSELECT -> X86ISD::SHRUNKBLEND using only the sign bit;
	///  8. push bitcasts into the arms of a mask-register VSELECT;
	///  9. x86mmx select -> i64 select.
	///
	/// \returns The replacement value; SDValue(N, 0) when a change was made in
	///          place through DCI; or an empty SDValue when nothing matched or the
	///          DAG was already updated directly (SHRUNKBLEND path).
	static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI,
	                             const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Cond = N->getOperand(0);
	  // Get the LHS/RHS of the select.
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  EVT VT = LHS.getValueType();
	  EVT CondVT = Cond.getValueType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
	  // instructions match the semantics of the common C idiom x<y?x:y but not
	  // x<=y?x:y, because of how they handle negative zero (which can be
	  // ignored in unsafe-math mode).
	  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
	  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
	      VT != MVT::f80 && VT != MVT::f128 &&
	      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
	      (Subtarget.hasSSE2() ||
	       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    unsigned Opcode = 0;
	    // Check for x CC y ? x : y.
	    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	      switch (CC) {
	      default: break;
	      case ISD::SETULT:
	        // Converting this to a min would handle NaNs incorrectly, and swapping
	        // the operands would cause it to handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
	          if (!DAG.getTarget().Options.UnsafeFPMath &&
	              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETOLE:
	        // Converting this to a min would handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
	          break;
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETULE:
	        // Converting this to a min would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOLT:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        Opcode = X86ISD::FMIN;
	        break;

	      case ISD::SETOGE:
	        // Converting this to a max would handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
	          break;
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETUGT:
	        // Converting this to a max would handle NaNs incorrectly, and swapping
	        // the operands would cause it to handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
	          if (!DAG.getTarget().Options.UnsafeFPMath &&
	              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETUGE:
	        // Converting this to a max would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        Opcode = X86ISD::FMAX;
	        break;
	      }
	    // Check for x CC y ? y : x -- a min/max with reversed arms.
	    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
	               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
	      switch (CC) {
	      default: break;
	      case ISD::SETOGE:
	        // Converting this to a min would handle comparisons between positive
	        // and negative zero incorrectly, and swapping the operands would
	        // cause it to handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
	          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETUGT:
	        // Converting this to a min would handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
	          break;
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETUGE:
	        // Converting this to a min would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        Opcode = X86ISD::FMIN;
	        break;

	      case ISD::SETULT:
	        // Converting this to a max would handle NaNs incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	          break;
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETOLE:
	        // Converting this to a max would handle comparisons between positive
	        // and negative zero incorrectly, and swapping the operands would
	        // cause it to handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
	          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETULE:
	        // Converting this to a max would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOLT:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        Opcode = X86ISD::FMAX;
	        break;
	      }
	    }

	    if (Opcode)
	      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
	  }

	  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
	  // lowering on KNL. In this case we convert it to
	  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
	  // The same situation for all 128 and 256-bit vectors of i8 and i16.
	  // Since SKX these selects have a proper lowering.
	  if (Subtarget.hasAVX512() && CondVT.isVector() &&
	      CondVT.getVectorElementType() == MVT::i1 &&
	      (VT.is128BitVector() || VT.is256BitVector()) &&
	      (VT.getVectorElementType() == MVT::i8 ||
	       VT.getVectorElementType() == MVT::i16) &&
	      !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
	    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
	    DCI.AddToWorklist(Cond.getNode());
	    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
	  }

	  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
	    return V;

	  // Canonicalize max and min:
	  // (x > y) ? x : y -> (x >= y) ? x : y
	  // (x < y) ? x : y -> (x <= y) ? x : y
	  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
	  // the need for an extra compare against zero. e.g.
	  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
	  // subl   %esi, %edi
	  // testl  %edi, %edi
	  // movl   $0, %eax
	  // cmovgl %edi, %eax
	  // =>
	  // xorl   %eax, %eax
	  // subl   %esi, %edi
	  // cmovsl %eax, %edi
	  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
	      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	    switch (CC) {
	    default: break;
	    case ISD::SETLT:
	    case ISD::SETGT: {
	      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
	      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
	                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
	      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
	    }
	    }
	  }

	  // Early exit check
	  if (!TLI.isTypeLegal(VT))
	    return SDValue();

	  // Match VSELECTs into subs with unsigned saturation.
	  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
	      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
	       (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
	    // left side invert the predicate to simplify logic below.
	    SDValue Other;
	    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
	      Other = RHS;
	      CC = ISD::getSetCCInverse(CC, true);
	    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
	      Other = LHS;
	    }

	    if (Other.getNode() && Other->getNumOperands() == 2 &&
	        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
	      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
	      SDValue CondRHS = Cond->getOperand(1);

	      // Look for a general sub with unsigned saturation first.
	      // x >= y ? x-y : 0 --> subus x, y
	      // x >  y ? x-y : 0 --> subus x, y
	      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
	          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
	        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

	      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
	        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
	          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
	            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
	              // If the RHS is a constant we have to reverse the const
	              // canonicalization.
	              // x > C-1 ? x+-C : 0 --> subus x, C
	              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
	                  CondRHSConst->getAPIntValue() ==
	                      (-OpRHSConst->getAPIntValue() - 1))
	                return DAG.getNode(
	                    X86ISD::SUBUS, DL, VT, OpLHS,
	                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

	          // Another special case: If C was a sign bit, the sub has been
	          // canonicalized into a xor.
	          // FIXME: Would it be better to use computeKnownBits to determine
	          //        whether it's safe to decanonicalize the xor?
	          // x s< 0 ? x^C : 0 --> subus x, C
	          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
	              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
	              OpRHSConst->getAPIntValue().isSignMask())
	            // Note that we have to rebuild the RHS constant here to ensure we
	            // don't rely on particular values of undef lanes.
	            return DAG.getNode(
	                X86ISD::SUBUS, DL, VT, OpLHS,
	                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
	        }
	    }
	  }

	  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
	    return V;

	  // If this is a *dynamic* select (non-constant condition) and we can match
	  // this node with one of the variable blend instructions, restructure the
	  // condition so that blends can use the high (sign) bit of each element and
	  // use SimplifyDemandedBits to simplify the condition operand.
	  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
	      !DCI.isBeforeLegalize() &&
	      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
	    unsigned BitWidth = Cond.getScalarValueSizeInBits();

	    // Don't optimize vector selects that map to mask-registers.
	    if (BitWidth == 1)
	      return SDValue();

	    // We can only handle the cases where VSELECT is directly legal on the
	    // subtarget. We custom lower VSELECT nodes with constant conditions and
	    // this makes it hard to see whether a dynamic VSELECT will correctly
	    // lower, so we both check the operation's status and explicitly handle the
	    // cases where a *dynamic* blend will fail even though a constant-condition
	    // blend could be custom lowered.
	    // FIXME: We should find a better way to handle this class of problems.
	    // Potentially, we should combine constant-condition vselect nodes
	    // pre-legalization into shuffles and not mark as many types as custom
	    // lowered.
	    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
	      return SDValue();
	    // FIXME: We don't support i16-element blends currently. We could and
	    // should support them by making *all* the bits in the condition be set
	    // rather than just the high bit and using an i8-element blend.
	    if (VT.getVectorElementType() == MVT::i16)
	      return SDValue();
	    // Dynamic blending was only available from SSE4.1 onward.
	    if (VT.is128BitVector() && !Subtarget.hasSSE41())
	      return SDValue();
	    // Byte blends are only available in AVX2
	    if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	      return SDValue();
	    // There are no 512-bit blend instructions that use sign bits.
	    if (VT.is512BitVector())
	      return SDValue();

	    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
	    // Only the sign bit of each condition element is demanded.
	    APInt DemandedMask(APInt::getSignMask(BitWidth));
	    KnownBits Known;
	    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                          !DCI.isBeforeLegalizeOps());
	    if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
	        TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
	      // If we changed the computation somewhere in the DAG, this change will
	      // affect all users of Cond. Make sure it is fine and update all the nodes
	      // so that we do not use the generic VSELECT anymore. Otherwise, we may
	      // perform wrong optimizations as we messed with the actual expectation
	      // for the vector boolean values.
	      if (Cond != TLO.Old) {
	        // Check all uses of the condition operand to check whether it will be
	        // consumed by non-BLEND instructions. Those may require that all bits
	        // are set properly.
	        for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
	             UI != UE; ++UI) {
	          // TODO: Add other opcodes eventually lowered into BLEND.
	          if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
	            return SDValue();
	        }

	        // Update all users of the condition before committing the change, so
	        // that the VSELECT optimizations that expect the correct vector boolean
	        // value will not be triggered.
	        for (SDNode *U : Cond->uses()) {
	          SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
	                                   U->getValueType(0), Cond, U->getOperand(1),
	                                   U->getOperand(2));
	          DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
	        }
	        DCI.CommitTargetLoweringOpt(TLO);
	        return SDValue();
	      }
	      // Only Cond (rather than other nodes in the computation chain) was
	      // changed. Change the condition just for N to keep the opportunity to
	      // optimize all other users their own way.
	      SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
	      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
	      return SDValue();
	    }
	  }

	  // Look for vselects with LHS/RHS being bitcasted from an operation that
	  // can be executed on another type. Push the bitcast to the inputs of
	  // the operation. This exposes opportunities for using masking instructions.
	  if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
	      CondVT.getVectorElementType() == MVT::i1) {
	    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
	      return SDValue(N, 0);
	    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
	      return SDValue(N, 0);
	  }

	  // Custom action for SELECT MMX
	  if (VT == MVT::x86mmx) {
	    LHS = DAG.getBitcast(MVT::i64, LHS);
	    RHS = DAG.getBitcast(MVT::i64, RHS);
	    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
	    return DAG.getBitcast(VT, newSelect);
	  }

	  return SDValue();
	}

	/// Combine:
	///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
	/// to:
	///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
	/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
	/// Note that this is only legal for some op/cc combinations.
	///
	/// \param Cmp The flag-producing compare (X86ISD::CMP, or an X86ISD::SUB whose
	///            value result is unused).
	/// \param CC  The condition code the user tests; may be rewritten in place
	///            when the combine succeeds.
	/// \returns The LOCKed operation to use in place of \p Cmp, or an empty
	///          SDValue if the pattern does not apply.
	static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Can't replace the cmp if it has more uses than the one we're looking at.
	  // FIXME: We would like to be able to handle this, but would need to make sure
	  // all uses were updated.
	  if (!Cmp.hasOneUse())
	    return SDValue();

	  // This only applies to variations of the common case:
	  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
	  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
	  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
	  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
	  // Using the proper condcodes (see below), overflow is checked for.

	  // FIXME: We can generalize both constraints:
	  // - XOR/OR/AND (if they were made to survive AtomicExpand)
	  // - LHS != 1
	  // if the result is compared.

	  SDValue CmpLHS = Cmp.getOperand(0);
	  SDValue CmpRHS = Cmp.getOperand(1);

	  // The loaded value is replaced with UNDEF below, so the compare must be
	  // its only user.
	  if (!CmpLHS.hasOneUse())
	    return SDValue();

	  unsigned Opc = CmpLHS.getOpcode();
	  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
	    return SDValue();

	  SDValue OpRHS = CmpLHS.getOperand(2);
	  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
	  if (!OpRHSC)
	    return SDValue();

	  // Normalize to "value added": a subtraction adds the negated constant.
	  APInt Addend = OpRHSC->getAPIntValue();
	  if (Opc == ISD::ATOMIC_LOAD_SUB)
	    Addend = -Addend;

	  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
	  if (!CmpRHSC)
	    return SDValue();

	  APInt Comparison = CmpRHSC->getAPIntValue();

	  // If the addend is the negation of the comparison value, then we can do
	  // a full comparison by emitting the atomic arithmetic as a locked sub.
	  if (Comparison == -Addend) {
	    // The CC is fine, but we need to rewrite the LHS of the comparison as an
	    // atomic sub.
	    auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
	    auto AtomicSub = DAG.getAtomic(
	        ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
	        /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
	        /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
	        AN->getMemOperand());
	    // If the comparison uses the CF flag we can't use INC/DEC instructions.
	    bool NeedCF = false;
	    switch (CC) {
	    default: break;
	    case X86::COND_A: case X86::COND_AE:
	    case X86::COND_B: case X86::COND_BE:
	      NeedCF = true;
	      break;
	    }
	    auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
	    // The loaded value is no longer produced; its (only) user is the compare
	    // being replaced. Re-route the chain result to the new LOCKed op.
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                  DAG.getUNDEF(CmpLHS.getValueType()));
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	    return LockOp;
	  }

	  // We can handle comparisons with zero in a number of cases by manipulating
	  // the CC used.
	  if (!Comparison.isNullValue())
	    return SDValue();

	  if (CC == X86::COND_S && Addend == 1)
	    CC = X86::COND_LE;
	  else if (CC == X86::COND_NS && Addend == 1)
	    CC = X86::COND_G;
	  else if (CC == X86::COND_G && Addend == -1)
	    CC = X86::COND_GE;
	  else if (CC == X86::COND_LE && Addend == -1)
	    CC = X86::COND_L;
	  else
	    return SDValue();

	  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                DAG.getUNDEF(CmpLHS.getValueType()));
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	  return LockOp;
	}

	// Check whether a boolean test is testing a boolean value generated by
	// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
	// code.
	//
	// Simplify the following patterns:
	// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
	// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
	// to (Op EFLAGS Cond)
	//
	// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
	// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
	// to (Op EFLAGS !Cond)
	//
	// where Op could be BRCOND or CMOV.
	//
	// On success CC is overwritten with the condition to test on the returned
	// flags value; on failure an empty SDValue is returned. Note that the CMOV
	// case reads CC's final value only after all checks have passed, but CC is
	// not modified on any failing path.
	static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Quit if not used as a boolean value.
	  if (CC != X86::COND_E && CC != X86::COND_NE)
	    return SDValue();

	  // Check CMP operands. One of them should be 0 or 1 and the other should be
	  // an SetCC or extended from it.
	  SDValue Op1 = Cmp.getOperand(0);
	  SDValue Op2 = Cmp.getOperand(1);

	  SDValue SetCC;
	  const ConstantSDNode* C = nullptr;
	  bool needOppositeCond = (CC == X86::COND_E);
	  bool checkAgainstTrue = false; // Is it a comparison against 1?

	  if ((C = dyn_cast<ConstantSDNode>(Op1)))
	    SetCC = Op2;
	  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
	    SetCC = Op1;
	  else // Quit if neither operand is a constant.
	    return SDValue();

	  if (C->getZExtValue() == 1) {
	    needOppositeCond = !needOppositeCond;
	    checkAgainstTrue = true;
	  } else if (C->getZExtValue() != 0)
	    // Quit if the constant is neither 0 or 1.
	    return SDValue();

	  bool truncatedToBoolWithAnd = false;
	  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
	  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
	         SetCC.getOpcode() == ISD::TRUNCATE ||
	         SetCC.getOpcode() == ISD::AND) {
	    if (SetCC.getOpcode() == ISD::AND) {
	      // Look through the AND only when one side is the constant 1; OpIdx
	      // names the *other* operand.
	      int OpIdx = -1;
	      if (isOneConstant(SetCC.getOperand(0)))
	        OpIdx = 1;
	      if (isOneConstant(SetCC.getOperand(1)))
	        OpIdx = 0;
	      if (OpIdx < 0)
	        break;
	      SetCC = SetCC.getOperand(OpIdx);
	      truncatedToBoolWithAnd = true;
	    } else
	      SetCC = SetCC.getOperand(0);
	  }

	  switch (SetCC.getOpcode()) {
	  case X86ISD::SETCC_CARRY:
	    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
	    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
	    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
	    // truncated to i1 using 'and'.
	    if (checkAgainstTrue && !truncatedToBoolWithAnd)
	      break;
	    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
	           "Invalid use of SETCC_CARRY!");
	    LLVM_FALLTHROUGH;
	  case X86ISD::SETCC:
	    // Set the condition code or opposite one if necessary.
	    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
	    if (needOppositeCond)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return SetCC.getOperand(1);
	  case X86ISD::CMOV: {
	    // Check whether false/true value has canonical one, i.e. 0 or 1.
	    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
	    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
	    // Quit if true value is not a constant.
	    if (!TVal)
	      return SDValue();
	    // Quit if false value is not a constant.
	    if (!FVal) {
	      SDValue Op = SetCC.getOperand(0);
	      // Skip 'zext' or 'trunc' node.
	      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
	          Op.getOpcode() == ISD::TRUNCATE)
	        Op = Op.getOperand(0);
	      // A special case for rdrand/rdseed, where 0 is set if false cond is
	      // found.
	      if ((Op.getOpcode() != X86ISD::RDRAND &&
	           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
	        return SDValue();
	    }
	    // Quit if false value is not the constant 0 or 1.
	    bool FValIsFalse = true;
	    if (FVal && FVal->getZExtValue() != 0) {
	      if (FVal->getZExtValue() != 1)
	        return SDValue();
	      // If FVal is 1, opposite cond is needed.
	      needOppositeCond = !needOppositeCond;
	      FValIsFalse = false;
	    }
	    // Quit if TVal is not the constant opposite of FVal.
	    if (FValIsFalse && TVal->getZExtValue() != 1)
	      return SDValue();
	    if (!FValIsFalse && TVal->getZExtValue() != 0)
	      return SDValue();
	    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
	    if (needOppositeCond)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return SetCC.getOperand(3);
	  }
	  }

	  return SDValue();
	}

	/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
	/// Match:
	///   (X86or (X86setcc) (X86setcc))
	///   (X86cmp (and (X86setcc) (X86setcc)), 0)
	///
	/// \param Cond  The value to inspect; a compare against zero is looked
	///              through first.
	/// \param CC0   [out] Condition code of the first SETCC.
	/// \param CC1   [out] Condition code of the second SETCC.
	/// \param Flags [out] The flags value both SETCCs read.
	/// \param isAnd [out] True for AND, false for OR. Written as soon as the
	///              optional compare has been looked through, so it may be
	///              modified even when the function returns false.
	/// \returns True if the pattern matched and CC0/CC1/Flags were filled in.
	static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
	                                           X86::CondCode &CC1, SDValue &Flags,
	                                           bool &isAnd) {
	  if (Cond->getOpcode() == X86ISD::CMP) {
	    if (!isNullConstant(Cond->getOperand(1)))
	      return false;

	    Cond = Cond->getOperand(0);
	  }

	  isAnd = false;

	  SDValue SetCC0, SetCC1;
	  switch (Cond->getOpcode()) {
	  default: return false;
	  case ISD::AND:
	  case X86ISD::AND:
	    isAnd = true;
	    LLVM_FALLTHROUGH;
	  case ISD::OR:
	  case X86ISD::OR:
	    SetCC0 = Cond->getOperand(0);
	    SetCC1 = Cond->getOperand(1);
	    break;
	  }

	  // Make sure we have SETCC nodes, using the same flags value.
	  if (SetCC0.getOpcode() != X86ISD::SETCC ||
	      SetCC1.getOpcode() != X86ISD::SETCC ||
	      SetCC0->getOperand(1) != SetCC1->getOperand(1))
	    return false;

	  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
	  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
	  Flags = SetCC0->getOperand(1);
	  return true;
	}

	// When legalizing carry, we create carries via add X, -1
	// If that comes from an actual carry, via setcc, we use the
	// carry directly.
	//
	// Returns the flags operand of the underlying SETCC/SETCC_CARRY when EFLAGS
	// is (X86ISD::ADD Carry, -1) and Carry is a COND_B setcc, possibly wrapped in
	// truncate / extend / (and ..., 1) nodes. Returns an empty SDValue otherwise.
	static SDValue combineCarryThroughADD(SDValue EFLAGS) {
	  if (EFLAGS.getOpcode() != X86ISD::ADD ||
	      !isAllOnesConstant(EFLAGS.getOperand(1)))
	    return SDValue();

	  // Strip the nodes that merely resize or mask the boolean carry value.
	  SDValue Carry = EFLAGS.getOperand(0);
	  while (Carry.getOpcode() == ISD::TRUNCATE ||
	         Carry.getOpcode() == ISD::ZERO_EXTEND ||
	         Carry.getOpcode() == ISD::SIGN_EXTEND ||
	         Carry.getOpcode() == ISD::ANY_EXTEND ||
	         (Carry.getOpcode() == ISD::AND &&
	          isOneConstant(Carry.getOperand(1))))
	    Carry = Carry.getOperand(0);

	  // Only a carry-flag (COND_B) setcc can be forwarded.
	  if ((Carry.getOpcode() == X86ISD::SETCC ||
	       Carry.getOpcode() == X86ISD::SETCC_CARRY) &&
	      Carry.getConstantOperandVal(0) == X86::COND_B)
	    return Carry.getOperand(1);

	  return SDValue();
	}

	/// Simplify the EFLAGS value \p EFLAGS as consumed under condition code
	/// \p CC. Three helpers are tried in order: the carry-through-ADD fold (only
	/// for COND_B), the boolean-test-of-SETCC fold, and finally the atomic
	/// arithmetic fold. \p CC is passed by reference to the latter two, which may
	/// return a new condition code through it, and chain value uses may be
	/// replaced.
	///
	/// \returns the simplified flags value, or whatever the last helper returns
	/// when the earlier ones do not apply.
	static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
	                                  SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  // A carry test may be able to use the original carry producer directly.
	  if (CC == X86::COND_B) {
	    SDValue CarryFlags = combineCarryThroughADD(EFLAGS);
	    if (CarryFlags)
	      return CarryFlags;
	  }

	  SDValue BoolTestFlags = checkBoolTestSetCCCombine(EFLAGS, CC);
	  if (BoolTestFlags)
	    return BoolTestFlags;

	  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
	}

	/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
	///
	/// Operand 0 is the value selected when the condition is false, operand 1 the
	/// value selected when it is true (the opposite order of ISD::SELECT).
	///
	/// \returns a replacement value for \p N, or an empty SDValue if no fold
	/// applies.
	static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc DL(N);

	  SDValue FalseOp = N->getOperand(0);
	  SDValue TrueOp = N->getOperand(1);
	  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
	  SDValue Cond = N->getOperand(3);

	  if (CC == X86::COND_E || CC == X86::COND_NE) {
	    switch (Cond.getOpcode()) {
	    default: break;
	    case X86ISD::BSR:
	    case X86ISD::BSF:
	      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
	      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
	        return (CC == X86::COND_E) ? FalseOp : TrueOp;
	    }
	  }

	  // Try to simplify the EFLAGS and condition code operands.
	  // We can't always do this as FCMOV only supports a subset of X86 cond.
	  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
	    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
	      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
	                       Flags};
	      return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	    }
	  }

	  // If this is a select between two integer constants, try to do some
	  // optimizations. Note that the operands are ordered the opposite of SELECT
	  // operands.
	  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
	    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
	      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
	      // larger than FalseC (the false value).
	      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
	        CC = X86::GetOppositeBranchCondition(CC);
	        std::swap(TrueC, FalseC);
	        std::swap(TrueOp, FalseOp);
	      }

	      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
	      // This is efficient for any integer data type (including i8/i16) and
	      // shift amount.
	      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
	        Cond = getSETCC(CC, Cond, DL, DAG);

	        // Zero extend the condition if needed.
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

	        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
	        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
	                           DAG.getConstant(ShAmt, DL, MVT::i8));
	        return Cond;
	      }

	      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
	      // for any integer data type, including i8/i16.
	      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
	        Cond = getSETCC(CC, Cond, DL, DAG);

	        // Zero extend the condition if needed.
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
	                           FalseC->getValueType(0), Cond);
	        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                           SDValue(FalseC, 0));
	        return Cond;
	      }

	      // Optimize cases that will turn into an LEA instruction. This requires
	      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
	      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
	        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
	        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

	        bool isFastMultiplier = false;
	        if (Diff < 10) {
	          switch ((unsigned char)Diff) {
	          default: break;
	          case 1:  // result = add base, cond
	          case 2:  // result = lea base(    , cond*2)
	          case 3:  // result = lea base(cond, cond*2)
	          case 4:  // result = lea base(    , cond*4)
	          case 5:  // result = lea base(cond, cond*4)
	          case 8:  // result = lea base(    , cond*8)
	          case 9:  // result = lea base(cond, cond*8)
	            isFastMultiplier = true;
	            break;
	          }
	        }

	        if (isFastMultiplier) {
	          // Same difference as above, but at the width of the constants so
	          // it can be materialized as a DAG constant.
	          APInt Scale = TrueC->getAPIntValue()-FalseC->getAPIntValue();
	          Cond = getSETCC(CC, Cond, DL ,DAG);
	          // Zero extend the condition if needed.
	          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
	                             Cond);
	          // Scale the condition by the difference.
	          if (Scale != 1)
	            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
	                               DAG.getConstant(Scale, DL, Cond.getValueType()));

	          // Add the base if non-zero.
	          if (FalseC->getAPIntValue() != 0)
	            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                               SDValue(FalseC, 0));
	          return Cond;
	        }
	      }
	    }
	  }

	  // Handle these cases:
	  //   (select (x != c), e, c) -> (select (x != c), e, x),
	  //   (select (x == c), c, e) -> (select (x == c), x, e)
	  // where the c is an integer constant, and the "select" is the combination
	  // of CMOV and CMP.
	  //
	  // The rationale for this change is that the conditional-move from a constant
	  // needs two instructions, however, conditional-move from a register needs
	  // only one instruction.
	  //
	  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
	  //  some instruction-combining opportunities. This opt needs to be
	  //  postponed as late as possible.
	  //
	  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
	    // the DCI.xxxx conditions are provided to postpone the optimization as
	    // late as possible.

	    ConstantSDNode *CmpAgainst = nullptr;
	    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
	        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
	        !isa<ConstantSDNode>(Cond.getOperand(0))) {

	      // Canonicalize the "!=" form into the "==" form handled below.
	      if (CC == X86::COND_NE &&
	          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
	        CC = X86::GetOppositeBranchCondition(CC);
	        std::swap(TrueOp, FalseOp);
	      }

	      if (CC == X86::COND_E &&
	          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
	        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
	                          DAG.getConstant(CC, DL, MVT::i8), Cond };
	        return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	      }
	    }
	  }

	  // Fold and/or of setcc's to double CMOV:
	  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
	  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
	  //
	  // This combine lets us generate:
	  //   cmovcc1 (jcc1 if we don't have CMOV)
	  //   cmovcc2 (same)
	  // instead of:
	  //   setcc1
	  //   setcc2
	  //   and/or
	  //   cmovne (jne if we don't have CMOV)
	  // When we can't use the CMOV instruction, it might increase branch
	  // mispredicts.
	  // When we can use CMOV, or when there is no mispredict, this improves
	  // throughput and reduces register pressure.
	  //
	  if (CC == X86::COND_NE) {
	    SDValue Flags;
	    X86::CondCode CC0, CC1;
	    bool isAndSetCC;
	    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
	      if (isAndSetCC) {
	        std::swap(FalseOp, TrueOp);
	        CC0 = X86::GetOppositeBranchCondition(CC0);
	        CC1 = X86::GetOppositeBranchCondition(CC1);
	      }

	      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
	                        Flags};
	      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
	      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
	      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	      return CMOV;
	    }
	  }

	  return SDValue();
	}

	/// Different mul shrinking modes.
	enum ShrinkMode {
	  MULS8,  ///< Operands fit in a signed 8-bit range.
	  MULU8,  ///< Operands fit in an unsigned 8-bit range.
	  MULS16, ///< Operands fit in a signed 16-bit range.
	  MULU16  ///< Operands fit in an unsigned 16-bit range.
	};

	/// Determine whether the two operands of the multiply \p N (whose elements
	/// must be 32 bits wide) only carry 8 or 16 significant bits each.
	///
	/// \param Mode [out] The shrink mode to use; written only when the function
	///             returns true.
	/// \returns true if a narrower multiply can be used.
	static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
	  EVT VT = N->getOperand(0).getValueType();
	  if (VT.getScalarSizeInBits() != 32)
	    return false;

	  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
	  unsigned SignBits[2] = {1, 1};
	  bool IsPositive[2] = {false, false};
	  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
	    SDValue Opd = N->getOperand(OpIdx);
	    unsigned &NumSign = SignBits[OpIdx];
	    bool &NonNegative = IsPositive[OpIdx];

	    switch (Opd.getOpcode()) {
	    case ISD::ANY_EXTEND: {
	      // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so the sign bits
	      // are derived from the source element type instead. For anyextend, it
	      // is safe to assume an appropriate number of leading sign/zero bits.
	      EVT SrcEltVT = Opd.getOperand(0).getValueType().getVectorElementType();
	      if (SrcEltVT == MVT::i8)
	        NumSign = 25;
	      else if (SrcEltVT == MVT::i16)
	        NumSign = 17;
	      else
	        return false;
	      NonNegative = true;
	      break;
	    }
	    case ISD::BUILD_VECTOR: {
	      // Every defined element must be an integer constant. Track the
	      // narrowest range that covers all of them.
	      NumSign = 32;
	      NonNegative = true;
	      for (const SDValue &Elt : Opd.getNode()->op_values()) {
	        if (Elt.isUndef())
	          continue;
	        auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	        if (!EltC)
	          return false;
	        APInt EltVal = EltC->getAPIntValue();
	        if (EltVal.isNegative())
	          NonNegative = false;
	        NumSign = std::min(NumSign, EltVal.getNumSignBits());
	      }
	      break;
	    }
	    default:
	      NumSign = DAG.ComputeNumSignBits(Opd);
	      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
	        NonNegative = true;
	      break;
	    }
	  }

	  const bool AllPositive = IsPositive[0] && IsPositive[1];
	  const unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);

	  // Ranges from -128 ~ 127 select MULS8.
	  if (MinSignBits >= 25) {
	    Mode = MULS8;
	    return true;
	  }
	  // Ranges from 0 ~ 255 select MULU8.
	  if (AllPositive && MinSignBits >= 24) {
	    Mode = MULU8;
	    return true;
	  }
	  // Ranges from -32768 ~ 32767 select MULS16.
	  if (MinSignBits >= 17) {
	    Mode = MULS16;
	    return true;
	  }
	  // Ranges from 0 ~ 65535 select MULU16.
	  if (AllPositive && MinSignBits >= 16) {
	    Mode = MULU16;
	    return true;
	  }
	  return false;
	}

	/// When the operands of vector mul are extended from smaller size values,
	/// like i8 and i16, the type of mul may be shrinked to generate more
	/// efficient code. Two typical patterns are handled:
	/// Pattern1:
	///     %2 = sext/zext <N x i8> %1 to <N x i32>
	///     %4 = sext/zext <N x i8> %3 to <N x i32>
	///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///     %5 = mul <N x i32> %2, %4
	///
	/// Pattern2:
	///     %2 = zext/sext <N x i16> %1 to <N x i32>
	///     %4 = zext/sext <N x i16> %3 to <N x i32>
	///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///     %5 = mul <N x i32> %2, %4
	///
	/// There are four mul shrinking modes:
	/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
	/// generate pmullw+sext32 for it (MULS8 mode).
	/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
	/// generate pmullw+zext32 for it (MULU8 mode).
	/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
	/// generate pmullw+pmulhw for it (MULS16 mode).
	/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
	/// generate pmullw+pmulhuw for it (MULU16 mode).
	///
	/// \returns the narrowed replacement for \p N, or an empty SDValue when the
	/// transform is not legal, not profitable, or not applicable.
	static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // Check for legality
	  // pmullw/pmulhw are not supported by SSE.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Check for profitability
	  // pmulld is supported since SSE41. It is better to use pmulld
	  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
	  // the expansion.
	  bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
	  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
	    return SDValue();

	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(N, DAG, Mode))
	    return SDValue();

	  SDLoc DL(N);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getOperand(0).getValueType();
	  unsigned NumElts = VT.getVectorNumElements();
	  if ((NumElts % 2) != 0)
	    return SDValue();

	  // If the upper 17 bits of each element are zero then we can use PMADD.
	  APInt Mask17 = APInt::getHighBitsSet(32, 17);
	  if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
	      DAG.MaskedValueIsZero(N1, Mask17))
	    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
	                       DAG.getBitcast(MVT::v8i16, N1));

	  unsigned RegSize = 128;
	  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
	  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

	  // Shrink the operands of mul.
	  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
	  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

	  if (NumElts >= OpsVT.getVectorNumElements()) {
	    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
	    // lower part is needed.
	    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
	    if (Mode == MULU8 || Mode == MULS8) {
	      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
	                         DL, VT, MulLo);
	    } else {
	      MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
	      // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
	      // the higher part is also needed.
	      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
	                                  ReducedVT, NewN0, NewN1);

	      // Repack the lower part and higher part result of mul into a wider
	      // result.
	      // Generate shuffle functioning as punpcklwd.
	      SmallVector<int, 16> ShuffleMask(NumElts);
	      for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
	        ShuffleMask[2 * i] = i;
	        ShuffleMask[2 * i + 1] = i + NumElts;
	      }
	      SDValue ResLo =
	          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
	      ResLo = DAG.getBitcast(ResVT, ResLo);
	      // Generate shuffle functioning as punpckhwd.
	      for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
	        ShuffleMask[2 * i] = i + NumElts / 2;
	        ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
	      }
	      SDValue ResHi =
	          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
	      ResHi = DAG.getBitcast(ResVT, ResHi);
	      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
	    }
	  } else {
	    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
	    // to legalize the mul explicitly because implicit legalization for type
	    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
	    // instructions which will not exist when we explicitly legalize it by
	    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
	    // <4 x i16> undef).
	    //
	    // Legalize the operands of mul.
	    // FIXME: We may be able to handle non-concatenated vectors by insertion.
	    unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
	    if ((RegSize % ReducedSizeInBits) != 0)
	      return SDValue();

	    SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
	                                 DAG.getUNDEF(ReducedVT));
	    Ops[0] = NewN0;
	    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
	    Ops[0] = NewN1;
	    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);

	    if (Mode == MULU8 || Mode == MULS8) {
	      // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
	      // part is needed.
	      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

	      // convert the type of mul result to VT.
	      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
	      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
	                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
	                                DL, ResVT, Mul);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                         DAG.getIntPtrConstant(0, DL));
	    } else {
	      // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
	      // MULU16/MULS16, both parts are needed.
	      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
	      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
	                                  OpsVT, NewN0, NewN1);

	      // Repack the lower part and higher part result of mul into a wider
	      // result. Make sure the type of mul result is VT.
	      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
	      SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
	      Res = DAG.getBitcast(ResVT, Res);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                         DAG.getIntPtrConstant(0, DL));
	    }
	  }
	}

	/// Expand a multiply of N's first operand by one of a fixed set of constants
	/// (11, 13, 14, 19, 21, 22, 23, 26, 28, 29, 30) into a short sequence built
	/// from X86ISD::MUL_IMM, shifts, multiplies and adds/subs.
	///
	/// \param MulAmt The constant multiplier.
	/// \param N      The multiply node; only its operand 0 is used.
	/// \param VT     Result type of the generated nodes.
	/// \param DL     Debug location for the generated nodes.
	/// \returns the expanded value, or an empty SDValue for any other \p MulAmt.
	static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
	                                 EVT VT, SDLoc DL) {

	  // Builds ((x * Mult) << Shift) +/- x.
	  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(Mult, DL, VT));
	    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
	                         DAG.getConstant(Shift, DL, MVT::i8));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  // Builds ((x * 9) * 3) +/- x.
	  auto combineMulMulAddOrSub = [&](bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(9, DL, VT));
	    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  switch (MulAmt) {
	  default:
	    break;
	  case 11:
	    // mul x, 11 => add ((shl (mul x, 5), 1), x)
	    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
	  case 21:
	    // mul x, 21 => add ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
	  case 22:
	    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
	  case 19:
	    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
	  case 13:
	    // mul x, 13 => add ((shl (mul x, 3), 2), x)
	    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
	  case 23:
	    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
	    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
	  case 14:
	    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
	  case 26:
	    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(/*isAdd*/ false);
	  case 28:
	    // mul x, 28 => add ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(/*isAdd*/ true);
	  case 29:
	    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulMulAddOrSub(/*isAdd*/ true));
	  case 30:
	    // mul x, 30 => sub (sub ((shl x, 5), x), x)
	    return DAG.getNode(
	        ISD::SUB, DL, VT,
	        DAG.getNode(ISD::SUB, DL, VT,
	                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                                DAG.getConstant(5, DL, MVT::i8)),
	                    N->getOperand(0)),
	        N->getOperand(0));
	  }
	  return SDValue();
	}

	/// Optimize a single multiply with constant into two operations in order to
	/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
	///
	/// Vector multiplies seen before legalization are instead forwarded to
	/// reduceVMULWidth. For the scalar i32/i64 case the replacement is installed
	/// through DCI.CombineTo, so this function itself returns an empty SDValue on
	/// every scalar path.
	static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (DCI.isBeforeLegalize() && VT.isVector())
	    return reduceVMULWidth(N, DAG, Subtarget);

	  if (!MulConstantOptimization)
	    return SDValue();
	  // An imul is usually smaller than the alternative sequence.
	  if (DAG.getMachineFunction().getFunction().optForMinSize())
	    return SDValue();

	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  if (VT != MVT::i64 && VT != MVT::i32)
	    return SDValue();

	  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!C)
	    return SDValue();
	  uint64_t MulAmt = C->getZExtValue();
	  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
	    return SDValue();

	  // Try to split the multiplier as {9, 5, 3} * MulAmt2.
	  uint64_t MulAmt1 = 0;
	  uint64_t MulAmt2 = 0;
	  if ((MulAmt % 9) == 0) {
	    MulAmt1 = 9;
	    MulAmt2 = MulAmt / 9;
	  } else if ((MulAmt % 5) == 0) {
	    MulAmt1 = 5;
	    MulAmt2 = MulAmt / 5;
	  } else if ((MulAmt % 3) == 0) {
	    MulAmt1 = 3;
	    MulAmt2 = MulAmt / 3;
	  }

	  SDLoc DL(N);
	  SDValue NewMul;
	  if (MulAmt2 &&
	      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){

	    if (isPowerOf2_64(MulAmt2) &&
	        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
	      // If second multiplier is pow2, issue it first. We want the multiply by
	      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
	      // is an add.
	      std::swap(MulAmt1, MulAmt2);

	    if (isPowerOf2_64(MulAmt1))
	      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
	    else
	      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                           DAG.getConstant(MulAmt1, DL, VT));

	    if (isPowerOf2_64(MulAmt2))
	      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
	                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
	    else
	      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
	                           DAG.getConstant(MulAmt2, DL, VT));
	  } else if (!Subtarget.slowLEA())
	    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);

	  if (!NewMul) {
	    assert(MulAmt != 0 &&
	           MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
	           "Both cases that could cause potential overflows should have "
	           "already been handled.");
	    int64_t SignMulAmt = C->getSExtValue();
	    // Exclude the values for which the +/-1 adjustments or the negation below
	    // would overflow int64_t.
	    if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
	        (SignMulAmt != -INT64_MAX)) {
	      int NumSign = SignMulAmt > 0 ? 1 : -1;
	      bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
	      bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
	      if (IsPowerOf2_64PlusOne) {
	        // (mul x, 2^N + 1) => (add (shl x, N), x)
	        NewMul = DAG.getNode(
	            ISD::ADD, DL, VT, N->getOperand(0),
	            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                        DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
	                                        MVT::i8)));
	      } else if (IsPowerOf2_64MinusOne) {
	        // (mul x, 2^N - 1) => (sub (shl x, N), x)
	        NewMul = DAG.getNode(
	            ISD::SUB, DL, VT,
	            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                        DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
	                                        MVT::i8)),
	            N->getOperand(0));
	      }
	      // To negate, subtract the number from zero
	      if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
	        NewMul =
	            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
	    }
	  }

	  if (NewMul)
	    // Do not add new nodes to DAG combiner worklist.
	    DCI.CombineTo(N, NewMul, false);

	  return SDValue();
	}

	/// Target combines for ISD::SHL:
	///  - fold a shift of a masked SETCC_CARRY into a single AND with a shifted
	///    mask, and
	///  - turn a vector shift by a splat of 1 into an ADD of the value to itself.
	///
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
	  EVT VT = N0.getValueType();

	  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
	  // since the result of setcc_c is all zero's or all ones.
	  if (VT.isInteger() && !VT.isVector() &&
	      N1C && N0.getOpcode() == ISD::AND &&
	      N0.getOperand(1).getOpcode() == ISD::Constant) {
	    SDValue N00 = N0.getOperand(0);
	    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	    Mask <<= N1C->getAPIntValue();
	    bool MaskOK = false;
	    // We can handle cases concerning bit-widening nodes containing setcc_c if
	    // we carefully interrogate the mask to make sure we are semantics
	    // preserving.
	    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
	    // of the underlying setcc_c operation if the setcc_c was zero extended.
	    // Consider the following example:
	    //   zext(setcc_c)                 -> i32 0x0000FFFF
	    //   c1                            -> i32 0x0000FFFF
	    //   c2                            -> i32 0x00000001
	    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
	    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
	    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
	      MaskOK = true;
	    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
	               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
	      MaskOK = true;
	    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
	                N00.getOpcode() == ISD::ANY_EXTEND) &&
	               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
	      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
	    }
	    if (MaskOK && Mask != 0) {
	      SDLoc DL(N);
	      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
	    }
	  }

	  // Hardware support for vector shifts is sparse which makes us scalarize the
	  // vector operations in many cases. Also, on sandybridge ADD is faster than
	  // shl.
	  // (shl V, 1) -> add V,V
	  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
	    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
	      assert(N0.getValueType().isVector() && "Invalid vector shift type");
	      // We shift all of the values by one. In many cases we do not have
	      // hardware support for this operation. This is better expressed as an ADD
	      // of two values.
	      if (N1SplatC->getAPIntValue() == 1)
	        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
	    }

	  return SDValue();
	}

	/// Target combine for scalar ISD::SRA of a single-use constant SHL: replace
	/// the shl/sra pair with a SIGN_EXTEND_INREG followed, if needed, by one
	/// residual shift.
	///
	/// \returns the replacement value, or an empty SDValue if the pattern does
	/// not match.
	static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  unsigned Size = VT.getSizeInBits();

	  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
	  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
	  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
	  // depending on sign of (SarConst - [56,48,32,24,16])

	  // sexts in X86 are MOVs. The MOVs have the same code size
	  // as above SHIFTs (only SHIFT on 1 has lower code size).
	  // However the MOVs have 2 advantages to a SHIFT:
	  // 1. MOVs can write to a register that differs from source
	  // 2. MOVs accept memory operands

	  if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
	      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
	      N0.getOperand(1).getOpcode() != ISD::Constant)
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  SDValue N01 = N0.getOperand(1);
	  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
	  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
	  EVT CVT = N1.getValueType();

	  if (SarConst.isNegative())
	    return SDValue();

	  for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
	    unsigned ShiftSize = SVT.getSizeInBits();
	    // skipping types without corresponding sext/zext and
	    // ShlConst that is not one of [56,48,32,24,16]
	    if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
	      continue;
	    SDLoc DL(N);
	    SDValue NN =
	        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
	    // What is left of the right shift once the sign extension is accounted
	    // for. Every path below returns, so SarConst is adjusted at most once.
	    SarConst = SarConst - (Size - ShiftSize);
	    if (SarConst == 0)
	      return NN;
	    else if (SarConst.isNegative())
	      return DAG.getNode(ISD::SHL, DL, VT, NN,
	                         DAG.getConstant(-SarConst, DL, CVT));
	    else
	      return DAG.getNode(ISD::SRA, DL, VT, NN,
	                         DAG.getConstant(SarConst, DL, CVT));
	  }
	  return SDValue();
	}

	/// Target combine for ISD::SRL: rewrite srl (and X, C1), C2 as
	/// and (srl X, C2), (C1 >> C2) when that shrinks the mask constant to fit in
	/// 8 or 32 signed bits.
	///
	/// \returns the replacement value, or an empty SDValue if the pattern does
	/// not match or the mask would not shrink.
	static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();

	  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
	  // TODO: This is a generic DAG combine that became an x86-only combine to
	  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
	  // and-not ('andn').
	  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
	    return SDValue();

	  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
	  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	  if (!ShiftC || !AndC)
	    return SDValue();

	  // If we can shrink the constant mask below 8-bits or 32-bits, then this
	  // transform should reduce code size. It may also enable secondary transforms
	  // from improved known-bits analysis or instruction selection.
	  APInt MaskVal = AndC->getAPIntValue();
	  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
	  unsigned OldMaskSize = MaskVal.getMinSignedBits();
	  unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
	  if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
	      (OldMaskSize > 32 && NewMaskSize <= 32)) {
	    // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
	    SDLoc DL(N);
	    SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
	    SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
	    return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
	  }
	  return SDValue();
	}

	/// Dispatch a shift node to the combine that matches its opcode (SHL, SRA or
	/// SRL). Any other opcode yields an empty SDValue. \p DCI and \p Subtarget are
	/// not used by this function.
	static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  switch (N->getOpcode()) {
	  case ISD::SHL:
	    return combineShiftLeft(N, DAG);
	  case ISD::SRA:
	    return combineShiftRightArithmetic(N, DAG);
	  case ISD::SRL:
	    return combineShiftRightLogical(N, DAG);
	  default:
	    return SDValue();
	  }
	}

	/// Target combine for X86ISD::PACKSS / X86ISD::PACKUS.
	///
	/// Constant-folds the pack when both inputs are constants (each either undef
	/// or used only by \p N), applying signed (PACKSS) or unsigned (PACKUS)
	/// saturation per element within each 128-bit lane. Otherwise tries to
	/// combine the node as a shuffle; in that case the replacement is installed
	/// through DCI.CombineTo and an empty SDValue is returned.
	static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
	         "Unexpected pack opcode");

	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
	  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
	  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
	         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
	         "Unexpected PACKSS/PACKUS input type");

	  // Constant Folding.
	  APInt UndefElts0, UndefElts1;
	  SmallVector<APInt, 32> EltBits0, EltBits1;
	  if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
	      (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
	      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
	      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
	    unsigned NumLanes = VT.getSizeInBits() / 128;
	    unsigned NumDstElts = VT.getVectorNumElements();
	    unsigned NumSrcElts = NumDstElts / 2;
	    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
	    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
	    bool IsSigned = (X86ISD::PACKSS == Opcode);

	    APInt Undefs(NumDstElts, 0);
	    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
	    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
	        // Within a lane, the low half of the result comes from the first
	        // input and the high half from the second input.
	        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
	        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
	        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);

	        if (UndefElts[SrcIdx]) {
	          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
	          continue;
	        }

	        APInt &Val = EltBits[SrcIdx];
	        if (IsSigned) {
	          // PACKSS: Truncate signed value with signed saturation.
	          // Source values less than dst minint are saturated to minint.
	          // Source values greater than dst maxint are saturated to maxint.
	          if (Val.isSignedIntN(DstBitsPerElt))
	            Val = Val.trunc(DstBitsPerElt);
	          else if (Val.isNegative())
	            Val = APInt::getSignedMinValue(DstBitsPerElt);
	          else
	            Val = APInt::getSignedMaxValue(DstBitsPerElt);
	        } else {
	          // PACKUS: Truncate signed value with unsigned saturation.
	          // Source values less than zero are saturated to zero.
	          // Source values greater than dst maxuint are saturated to maxuint.
	          if (Val.isIntN(DstBitsPerElt))
	            Val = Val.trunc(DstBitsPerElt);
	          else if (Val.isNegative())
	            Val = APInt::getNullValue(DstBitsPerElt);
	          else
	            Val = APInt::getAllOnesValue(DstBitsPerElt);
	        }
	        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
	      }
	    }

	    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
	  }

	  // Attempt to combine as shuffle.
	  SDValue Op(N, 0);
	  if (SDValue Res = combineX86ShufflesRecursively(
	          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
	          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
	    DCI.CombineTo(N, Res);
	    return SDValue();
	  }

	  return SDValue();
	}

	/// Combine an X86 vector shift-by-immediate node (X86ISD::VSHLI, VSRAI or
	/// VSRLI). Operand 0 is the vector being shifted and operand 1 is the shift
	/// amount, which is unconditionally cast to a ConstantSDNode below.
	///
	/// The folds are attempted in this order: out-of-range shift amounts, shift
	/// by zero, shift of an all-zeros vector, VSRLI-of-VSRAI sign bit tests,
	/// VSRAI-of-VSHLI round trips, whole-byte logical shifts handed to the
	/// recursive shuffle combiner, and finally constant folding.
	///
	/// Returns the replacement value, or an empty SDValue when nothing was folded
	/// or when the node has already been replaced through DCI.CombineTo().
	static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	assert((X86ISD::VSHLI == Opcode \|\| X86ISD::VSRAI == Opcode \|\|
	X86ISD::VSRLI == Opcode) &&
	"Unexpected shift opcode");
	// VSHLI and VSRLI shift in zeros; VSRAI replicates the sign bit.
	bool LogicalShift = X86ISD::VSHLI == Opcode \|\| X86ISD::VSRLI == Opcode;
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
	"Unexpected value type");

	// Out of range logical bit shifts are guaranteed to be zero.
	// Out of range arithmetic bit shifts splat the sign bit.
	// Only the low 8 bits of the immediate take part in the range check.
	APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
	if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
	if (LogicalShift)
	return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
	else
	// Clamp the arithmetic shift to the largest in-range amount.
	ShiftVal = NumBitsPerElt - 1;
	}

	// Shift N0 by zero -> N0.
	if (!ShiftVal)
	return N0;

	// Shift zero -> zero.
	if (ISD::isBuildVectorAllZeros(N0.getNode()))
	return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

	// fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
	// This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
	// The check is written generically as "shift amount == element bits - 1".
	// TODO - support other sra opcodes as needed.
	if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
	N0.getOpcode() == X86ISD::VSRAI)
	return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);

	// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
	if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
	N1 == N0.getOperand(1)) {
	SDValue N00 = N0.getOperand(0);
	unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
	if (ShiftVal.ult(NumSignBits))
	return N00;
	}

	// We can decode 'whole byte' logical bit shifts as shuffles.
	if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	// The node is replaced in place, so report "no further change".
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	// Constant Folding.
	// Only attempted when this node is the sole user of N0.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	if (N->isOnlyUserOf(N0.getNode()) &&
	getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
	assert(EltBits.size() == VT.getVectorNumElements() &&
	"Unexpected shift value type");
	unsigned ShiftImm = ShiftVal.getZExtValue();
	// Apply the shift to every constant element in place.
	for (APInt &Elt : EltBits) {
	if (X86ISD::VSHLI == Opcode)
	Elt <<= ShiftImm;
	else if (X86ISD::VSRAI == Opcode)
	Elt.ashrInPlace(ShiftImm);
	else
	Elt.lshrInPlace(ShiftImm);
	}
	return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
	}

	return SDValue();
	}

	/// Combine a PINSRB (v16i8) or PINSRW (v8i16) vector insertion by offering
	/// the node to the recursive shuffle combiner as its own root.
	///
	/// When the shuffle combiner produces a replacement, the node is replaced
	/// through DCI.CombineTo(). In every case an empty SDValue is returned to the
	/// caller.
	static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   const X86Subtarget &Subtarget) {
	  assert(((N->getOpcode() == X86ISD::PINSRB &&
	           N->getValueType(0) == MVT::v16i8) ||
	          (N->getOpcode() == X86ISD::PINSRW &&
	           N->getValueType(0) == MVT::v8i16)) &&
	         "Unexpected vector insertion");

	  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
	  SDValue Root(N, 0);
	  SDValue Shuffle = combineX86ShufflesRecursively(
	      {Root}, 0, Root, {0}, {}, /*Depth*/ 1,
	      /*HasVarMask*/ false, DAG, DCI, Subtarget);
	  if (Shuffle)
	    DCI.CombineTo(N, Shuffle);

	  // Either nothing changed or the replacement was installed via CombineTo.
	  return SDValue();
	}

	/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
	/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
	/// OR -> CMPNEQSS.
	///
	/// The rewrite is only performed when SSE2 is available, the compared values
	/// are f32 or f64, the two condition codes are (E, NP) or (NE, P) in either
	/// order, and no user of \p N is of a kind that is treated as wanting flags.
	/// Returns the replacement value (an i8 truth value, or an
	/// EXTRACT_VECTOR_ELT of an FSETCCM mask on AVX-512) or an empty SDValue.
	/// \p DCI is not used by this function.
	static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// Out-parameter of isAndOrOfSetCCs(); it is not read again in this function.
	unsigned opcode;

	// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
	// we're requiring SSE2 for both.
	if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	// Operand 1 of each setcc is the node that produced the flags.
	SDValue CMP0 = N0->getOperand(1);
	SDValue CMP1 = N1->getOperand(1);
	SDLoc DL(N);

	// The SETCCs should both refer to the same CMP.
	if (CMP0.getOpcode() != X86ISD::CMP \|\| CMP0 != CMP1)
	return SDValue();

	SDValue CMP00 = CMP0->getOperand(0);
	SDValue CMP01 = CMP0->getOperand(1);
	EVT VT = CMP00.getValueType();

	if (VT == MVT::f32 \|\| VT == MVT::f64) {
	bool ExpectingFlags = false;
	// Check for any users that want flags:
	// Every user opcode other than CopyToReg and the three extends is treated
	// as wanting flags (the default label shares the BR_CC/BRCOND/SELECT arm).
	for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	!ExpectingFlags && UI != UE; ++UI)
	switch (UI->getOpcode()) {
	default:
	case ISD::BR_CC:
	case ISD::BRCOND:
	case ISD::SELECT:
	ExpectingFlags = true;
	break;
	case ISD::CopyToReg:
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	break;
	}

	if (!ExpectingFlags) {
	enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
	enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

	// Canonicalize so that the E/NE condition, if any, ends up in cc0.
	if (cc1 == X86::COND_E \|\| cc1 == X86::COND_NE) {
	X86::CondCode tmp = cc0;
	cc0 = cc1;
	cc1 = tmp;
	}

	if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) \|\|
	(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
	// FIXME: need symbolic constants for these magic numbers.
	// See X86ATTInstPrinter.cpp:printSSECC().
	// 0 is used for the (E, NP) pair and 4 for the (NE, P) pair.
	unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
	if (Subtarget.hasAVX512()) {
	// With AVX-512 the comparison yields a v1i1 mask; extract element 0.
	SDValue FSetCC =
	DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
	DAG.getConstant(x86cc, DL, MVT::i8));
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
	N->getSimpleValueType(0), FSetCC,
	DAG.getIntPtrConstant(0, DL));
	}
	SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
	CMP00.getValueType(), CMP00, CMP01,
	DAG.getConstant(x86cc, DL,
	MVT::i8));

	bool is64BitFP = (CMP00.getValueType() == MVT::f64);
	MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

	if (is64BitFP && !Subtarget.is64Bit()) {
	// On a 32-bit target, we cannot bitcast the 64-bit float to a
	// 64-bit integer, since that's not a legal type. Since
	// OnesOrZeroesF is all ones or all zeroes, we don't need all the
	// bits, but can do this little dance to extract the lowest 32 bits
	// and work with those going forward.
	SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	OnesOrZeroesF);
	SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
	OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
	Vector32, DAG.getIntPtrConstant(0, DL));
	IntVT = MVT::i32;
	}

	// Reinterpret the FP result as an integer, keep bit 0 and narrow to i8.
	SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
	SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
	DAG.getConstant(1, DL, IntVT));
	SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
	ANDed);
	return OneBitOfTruth;
	}
	}
	}
	}
	return SDValue();
	}

	/// Fold an AND in which one operand is a vector NOT into X86ISD::ANDNP:
	///   (and (xor X, -1), Y) -> (andnp X, Y)
	/// The NOT may be on either side; the left operand is examined first. Only
	/// v2i64, v4i64 and v8i64 results are handled. Returns an empty SDValue when
	/// the pattern does not match.
	static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::AND);

	  const EVT ResVT = N->getValueType(0);
	  const bool IsHandledType =
	      ResVT == MVT::v2i64 || ResVT == MVT::v4i64 || ResVT == MVT::v8i64;
	  if (!IsHandledType)
	    return SDValue();

	  // True when V is (xor ?, all-ones-build-vector).
	  auto IsVectorNot = [](SDValue V) {
	    return V.getOpcode() == ISD::XOR &&
	           ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
	  };

	  SDValue Inverted = N->getOperand(0);
	  SDValue Other = N->getOperand(1);
	  if (!IsVectorNot(Inverted)) {
	    // The left side is not a NOT; try the right side instead.
	    if (!IsVectorNot(Other))
	      return SDValue();
	    std::swap(Inverted, Other);
	  }

	  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), ResVT, Inverted.getOperand(0),
	                     Other);
	}

	// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
	// register. In most cases we actually compare or select YMM-sized registers
	// and mixing the two types creates horrible code. This method optimizes
	// some of the transition sequences.
	// Even with AVX-512 this is still useful for removing casts around logical
	// operations on vXi1 mask types.
	//
	// N must be an ANY_EXTEND, ZERO_EXTEND or SIGN_EXTEND with a vector result.
	// The matched shape is:
	//   ext (logic-op (trunc A), (trunc B | constant build vector))
	// where A (and B) already have the extended type VT. The logic op is then
	// performed directly on the wide type. Returns an empty SDValue when the
	// shape does not match. DCI and Subtarget are not used by this function.
	static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);
	assert(VT.isVector() && "Expected vector type");

	assert((N->getOpcode() == ISD::ANY_EXTEND \|\|
	N->getOpcode() == ISD::ZERO_EXTEND \|\|
	N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

	// The value being extended; it has to be a bitwise logic operation.
	SDValue Narrow = N->getOperand(0);
	EVT NarrowVT = Narrow.getValueType();

	if (Narrow->getOpcode() != ISD::XOR &&
	Narrow->getOpcode() != ISD::AND &&
	Narrow->getOpcode() != ISD::OR)
	return SDValue();

	SDValue N0 = Narrow->getOperand(0);
	SDValue N1 = Narrow->getOperand(1);
	SDLoc DL(Narrow);

	// The Left side has to be a trunc.
	if (N0.getOpcode() != ISD::TRUNCATE)
	return SDValue();

	// The type of the truncated inputs.
	// It must equal the extended type so that trunc + ext cancel out.
	if (N0->getOperand(0).getValueType() != VT)
	return SDValue();

	// The right side has to be a 'trunc' or a constant vector.
	bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
	N1.getOperand(0).getValueType() == VT;
	if (!RHSTrunc &&
	!ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
	return SDValue();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// The logic op must be available (legal or promoted) on the wide type.
	if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
	return SDValue();

	// Set N0 and N1 to hold the inputs to the new wide operation.
	N0 = N0->getOperand(0);
	if (RHSTrunc)
	N1 = N1->getOperand(0);
	else
	// Constant right-hand side: widen it with a zero extension.
	N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);

	// Generate the wide operation.
	SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
	unsigned Opcode = N->getOpcode();
	// Re-apply the semantics of the original extension on the wide value.
	switch (Opcode) {
	default: llvm_unreachable("Unexpected opcode");
	case ISD::ANY_EXTEND:
	return Op;
	case ISD::ZERO_EXTEND:
	return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
	case ISD::SIGN_EXTEND:
	return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
	Op, DAG.getValueType(NarrowVT));
	}
	}

	/// If both input operands of a logic op are being cast from floating point
	/// types, try to convert this into a floating point logic node to avoid
	/// unnecessary moves from SSE to integer registers.
	///
	/// \p N must be an ISD::AND, ISD::OR or ISD::XOR node. The fold applies to
	/// i32 results with SSE1 and i64 results with SSE2, and only when both
	/// operands are bitcasts from the *same scalar* FP type (f32 for i32, f64
	/// for i64). Bitcasts from FP vectors of the same total width (for example
	/// v2f32 -> i64) or from two different FP types are rejected, because the
	/// scalar FAND/FOR/FXOR node is built with the type of the first source.
	///
	/// Returns the bitcast of the FP logic node, or an empty SDValue.
	static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  unsigned FPOpcode = ISD::DELETED_NODE;
	  if (N->getOpcode() == ISD::AND)
	    FPOpcode = X86ISD::FAND;
	  else if (N->getOpcode() == ISD::OR)
	    FPOpcode = X86ISD::FOR;
	  else if (N->getOpcode() == ISD::XOR)
	    FPOpcode = X86ISD::FXOR;

	  assert(FPOpcode != ISD::DELETED_NODE &&
	         "Unexpected input node for FP logic conversion");

	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDLoc DL(N);

	  if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  SDValue N10 = N1.getOperand(0);
	  EVT N00Type = N00.getValueType();
	  EVT N10Type = N10.getValueType();

	  // Both sources must share one type, otherwise the FP logic node would be
	  // created with mismatched operand types.
	  if (N00Type != N10Type)
	    return SDValue();

	  // Only scalar f32 (SSE1) and f64 (SSE2) sources are accepted; this rules
	  // out FP vector sources that merely have the same bit width as VT.
	  bool IsF32Logic =
	      Subtarget.hasSSE1() && VT == MVT::i32 && N00Type == MVT::f32;
	  bool IsF64Logic =
	      Subtarget.hasSSE2() && VT == MVT::i64 && N00Type == MVT::f64;
	  if (!IsF32Logic && !IsF64Logic)
	    return SDValue();

	  SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
	  return DAG.getBitcast(VT, FPLogic);
	}

	/// Replace (and X, splat-low-bit-mask) with a logical right shift when every
	/// element of X is known to be all zeros or all ones (Mask == 1 is what the
	/// x86 lowering of SETCC + ZEXT produces). This avoids loading the vector
	/// constant mask.
	///
	/// Both operands are inspected through bitcasts. Returns a bitcast of the
	/// X86ISD::VSRLI node to the type of \p N, or an empty SDValue when the
	/// pattern does not match.
	static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDValue Src = peekThroughBitcasts(N->getOperand(0));
	  SDValue MaskOp = peekThroughBitcasts(N->getOperand(1));

	  // Both sides must have the same simple integer type.
	  const EVT SrcVT = Src.getValueType();
	  const bool TypesMatch = SrcVT == MaskOp.getValueType() &&
	                          SrcVT.isSimple() && SrcVT.isInteger();
	  if (!TypesMatch)
	    return SDValue();

	  // The mask has to be a constant splat of contiguous low set bits.
	  APInt MaskBits;
	  const bool IsLowBitSplat =
	      ISD::isConstantSplatVector(MaskOp.getNode(), MaskBits) &&
	      MaskBits.isMask();
	  if (!IsLowBitSplat)
	    return SDValue();

	  if (!SupportedVectorShiftWithImm(SrcVT.getSimpleVT(), Subtarget, ISD::SRL))
	    return SDValue();

	  // Every bit of each element must be a copy of its sign bit.
	  const unsigned BitsPerElt = SrcVT.getScalarSizeInBits();
	  if (DAG.ComputeNumSignBits(Src) != BitsPerElt)
	    return SDValue();

	  // Shifting right by (width - kept bits) leaves exactly the masked bits.
	  SDLoc DL(N);
	  const unsigned KeptBits = MaskBits.countTrailingOnes();
	  SDValue Amount = DAG.getConstant(BitsPerElt - KeptBits, DL, MVT::i8);
	  SDValue Srl = DAG.getNode(X86ISD::VSRLI, DL, SrcVT, Src, Amount);
	  return DAG.getBitcast(N->getValueType(0), Srl);
	}

	// Get the index node from the lowered DAG of a GEP IR instruction with one
	// indexing dimension.
	//
	// The load must be unindexed and its base pointer must have the shape
	// (add (shl Index, ...), ...); the first operand of the shift is returned.
	// Any other shape yields an empty SDValue.
	static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
	  if (Ld->isIndexed())
	    return SDValue();

	  SDValue Addr = Ld->getBasePtr();
	  if (Addr.getOpcode() == ISD::ADD) {
	    SDValue Scaled = Addr.getOperand(0);
	    if (Scaled.getOpcode() == ISD::SHL)
	      return Scaled.getOperand(0);
	  }

	  return SDValue();
	}

	// Returns true when this function considers BZHI usable for VT: BMI2 must be
	// present and VT must be a scalar integer of 32 bits, or of 64 bits on a
	// 64-bit subtarget.
	static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
	  if (!Subtarget.hasBMI2() || !VT.isScalarInteger())
	    return false;

	  const unsigned Width = VT.getSizeInBits();
	  if (Width == 32)
	    return true;
	  return Width == 64 && Subtarget.is64Bit();
	}

	// This function recognizes cases where X86 bzhi instruction can replace an
	// 'and-load' sequence.
	// In case of loading integer value from an array of constants which is defined
	// as follows:
	//
	// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
	//
	// then applying a bitwise and on the result with another input.
	// It's equivalent to performing bzhi (zero high bits) on the input, with the
	// same index of the load.
	//
	// Node is the AND being combined. Returns the replacement
	// (and Inp, (srl all-ones, (sub bitwidth, Index))), or an empty SDValue when
	// the pattern is not recognized.
	static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = Node->getSimpleValueType(0);
	  SDLoc dl(Node);

	  // Check if subtarget has BZHI instruction for the node's type
	  if (!hasBZHI(Subtarget, VT))
	    return SDValue();

	  // Try matching the pattern for both operands.
	  for (unsigned i = 0; i < 2; i++) {
	    SDValue N = Node->getOperand(i);
	    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

	    // Give up on the whole node if this operand is not a load, or if the
	    // load has no IR value attached to its memory operand.
	    // NOTE(review): this returns rather than trying the other operand, so a
	    // non-load first operand prevents the second one from being examined.
	    // Behaviour kept as is; confirm whether 'continue' was intended.
	    if (!Ld)
	      return SDValue();

	    const Value *MemOp = Ld->getMemOperand()->getValue();
	    if (!MemOp)
	      return SDValue();

	    // The address must be a GEP into a constant global that has a
	    // definitive initializer; otherwise try the other operand.
	    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp);
	    if (!GEP)
	      continue;
	    GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
	    if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
	      continue;

	    // The initializer must be an integer data array whose element width
	    // matches VT and that has no more elements than an element has bits.
	    Constant *Init = GV->getInitializer();
	    Type *Ty = Init->getType();
	    if (!isa<ConstantDataArray>(Init) ||
	        !Ty->getArrayElementType()->isIntegerTy() ||
	        Ty->getArrayElementType()->getScalarSizeInBits() !=
	            VT.getSizeInBits() ||
	        Ty->getArrayNumElements() >
	            Ty->getArrayElementType()->getScalarSizeInBits())
	      continue;

	    // Check if the array's constant elements are suitable to our case:
	    // element j must be the mask with the j low bits set.
	    uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
	    bool ConstantsMatch = true;
	    for (uint64_t j = 0; j < ArrayElementCount; j++) {
	      // Treat a missing or non-ConstantInt element as a mismatch instead
	      // of dereferencing a null pointer.
	      ConstantInt *Elem =
	          dyn_cast_or_null<ConstantInt>(Init->getAggregateElement(j));
	      if (!Elem || Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
	        ConstantsMatch = false;
	        break;
	      }
	    }
	    if (!ConstantsMatch)
	      continue;

	    // Do the transformation (For 32-bit type):
	    // -> (and (load arr[idx]), inp)
	    // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
	    // that will be replaced with one bzhi instruction.
	    SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
	    SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);

	    // Get the Node which indexes into the array.
	    SDValue Index = getIndexFromUnindexedLoad(Ld);
	    if (!Index)
	      return SDValue();
	    Index = DAG.getZExtOrTrunc(Index, dl, VT);

	    SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);

	    SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
	    SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

	    return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
	  }
	  return SDValue();
	}

	/// Target combine entry point for AND nodes in this file.
	///
	/// On SSE1-only subtargets a v4i32 AND is rewritten to X86ISD::FAND on
	/// v4f32. Everything else is skipped while DCI.isBeforeLegalizeOps() is
	/// true; after that the helper combines are tried in sequence and the first
	/// one that produces a value wins.
	///
	/// Returns the replacement value, or an empty SDValue when nothing was folded
	/// or the node was already replaced through DCI.CombineTo().
	static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FAND to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(
	MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
	DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
	}

	// The remaining combines are not attempted before operation legalization.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
	return R;

	if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
	return ShiftRight;

	if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
	return R;

	// Attempt to recursively combine a bitmask AND with shuffles.
	// Only for vectors whose elements are a whole number of bytes wide.
	if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	// The node is replaced in place, so report "no further change".
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	// Attempt to combine a scalar bitmask AND with an extracted shuffle.
	// Operand 0 must be an EXTRACT_VECTOR_ELT with a constant index.
	if ((VT.getScalarSizeInBits() % 8) == 0 &&
	N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
	SDValue BitMask = N->getOperand(1);
	SDValue SrcVec = N->getOperand(0).getOperand(0);
	EVT SrcVecVT = SrcVec.getValueType();

	// Check that the constant bitmask masks whole bytes.
	// The mask is decoded as 8-bit pieces, each of which must be 0x00 or 0xFF.
	// The extract must also be the only user of the source vector.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (VT == SrcVecVT.getScalarType() &&
	N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
	getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
	llvm::all_of(EltBits, [](APInt M) {
	return M.isNullValue() \|\| M.isAllOnesValue();
	})) {
	unsigned NumElts = SrcVecVT.getVectorNumElements();
	// Number of bytes per source vector element.
	unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
	unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

	// Create a root shuffle mask from the byte mask and the extracted index.
	// Only the bytes of the extracted element are defined; each is either kept
	// in place or zeroed. All other bytes stay undef.
	SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
	for (unsigned i = 0; i != Scale; ++i) {
	if (UndefElts[i])
	continue;
	int VecIdx = Scale * Idx + i;
	ShuffleMask[VecIdx] =
	EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
	}

	// On success, re-extract the same element from the combined shuffle.
	if (SDValue Shuffle = combineX86ShufflesRecursively(
	{SrcVec}, 0, SrcVec, ShuffleMask, {}, /Depth/ 2,
	/HasVarMask/ false, DAG, DCI, Subtarget))
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
	N->getOperand(0).getOperand(1));
	}
	}

	return SDValue();
	}

	// Try to fold:
	// (or (and (m, y), (pandn m, x)))
	// into:
	// (vselect m, x, y)
	// As a special case, try to fold:
	// (or (and (m, (sub 0, x)), (pandn m, x)))
	// into:
	// (sub (xor X, M), M)
	//
	// N must be an ISD::OR. Only 128-bit vectors with SSE2 and 256-bit vectors
	// with AVX2 integer support (hasInt256) are considered. Returns the
	// replacement value, or an empty SDValue when the pattern does not match.
	static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	if (!((VT.is128BitVector() && Subtarget.hasSSE2()) \|\|
	(VT.is256BitVector() && Subtarget.hasInt256())))
	return SDValue();

	// Canonicalize AND to LHS.
	if (N1.getOpcode() == ISD::AND)
	std::swap(N0, N1);

	// TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
	// ANDNP combine allows other combines to happen that prevent matching.
	if (N0.getOpcode() != ISD::AND \|\| N1.getOpcode() != X86ISD::ANDNP)
	return SDValue();

	// The ANDNP supplies the mask and X; the AND must supply the same mask
	// together with Y.
	SDValue Mask = N1.getOperand(0);
	SDValue X = N1.getOperand(1);
	SDValue Y;
	if (N0.getOperand(0) == Mask)
	Y = N0.getOperand(1);
	if (N0.getOperand(1) == Mask)
	Y = N0.getOperand(0);

	// Check to see if the mask appeared in both the AND and ANDNP.
	if (!Y.getNode())
	return SDValue();

	// Look through any bitcasts on X, Y and Mask.
	Mask = peekThroughBitcasts(Mask);
	X = peekThroughBitcasts(X);
	Y = peekThroughBitcasts(Y);

	EVT MaskVT = Mask.getValueType();
	unsigned EltBits = MaskVT.getScalarSizeInBits();

	// Every mask element must be all zeros or all ones (all bits are sign bits).
	// TODO: Attempt to handle floating point cases as well?
	if (!MaskVT.isInteger() \|\| DAG.ComputeNumSignBits(Mask) != EltBits)
	return SDValue();

	SDLoc DL(N);

	// Try to match:
	// (or (and (M, (sub 0, X)), (pandn M, X)))
	// which is a special case of vselect:
	// (vselect M, (sub 0, X), X)
	// Per:
	// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
	// We know that, if fNegate is 0 or 1:
	// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
	//
	// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
	// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
	// ( M ? -X : X) == ((X ^ M ) + (M & 1))
	// This lets us transform our vselect to:
	// (add (xor X, M), (and M, 1))
	// And further to:
	// (sub (xor X, M), M)
	if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
	DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
	// True when N computes (sub all-zeros, V), i.e. the negation of V.
	auto IsNegV = [](SDNode *N, SDValue V) {
	return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
	ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
	};
	// V is whichever of X / Y is negated by the other one.
	SDValue V;
	if (IsNegV(Y.getNode(), X))
	V = X;
	else if (IsNegV(X.getNode(), Y))
	V = Y;

	if (V) {
	SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
	SDValue SubOp2 = Mask;

	// If the negate was on the false side of the select, then
	// the operands of the SUB need to be swapped. PR 27251.
	// This is because the pattern being matched above is
	// (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
	// but if the pattern matched was
	// (vselect M, X, (sub (0, X))), that is really negation of the pattern
	// above, -(vselect M, (sub 0, X), X), and therefore the replacement
	// pattern also needs to be a negation of the replacement pattern above.
	// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
	// sub accomplishes the negation of the replacement pattern.
	if (V == Y)
	std::swap(SubOp1, SubOp2);

	SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
	return DAG.getBitcast(VT, Res);
	}
	}

	// PBLENDVB is only available on SSE 4.1.
	if (!Subtarget.hasSSE41())
	return SDValue();

	// The blend is done on bytes: v32i8 when VT is v4i64, v16i8 otherwise.
	// NOTE(review): a 256-bit VT other than v4i64 would select v16i8 here;
	// presumably such types never reach this point - confirm against callers.
	MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

	X = DAG.getBitcast(BlendVT, X);
	Y = DAG.getBitcast(BlendVT, Y);
	Mask = DAG.getBitcast(BlendVT, Mask);
	// The Mask variable is reused to hold the select result.
	Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
	return DAG.getBitcast(VT, Mask);
	}

	// Helper function for combineOrCmpEqZeroToCtlzSrl
	// Transforms:
	// seteq(cmp x, 0)
	// into:
	// srl(ctlz x), log2(bitsize(x))
	// Input pattern is checked by caller.
	//
	// Op is the setcc node; its operand 1 is the compare, whose operand 0 is x.
	// The result is zero-extended or truncated to ExtTy.
	static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
	                                          SelectionDAG &DAG) {
	  SDValue Cmp = Op.getOperand(1);
	  EVT VT = Cmp.getOperand(0).getValueType();
	  // ctlz(x) equals bitsize(x) only when x == 0, so bit log2(bitsize) of the
	  // count is the "x == 0" answer.
	  unsigned Log2b = Log2_32(VT.getSizeInBits());
	  SDLoc dl(Op);
	  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
	  // The result of the shift is true or false, and on X86, the 32-bit
	  // encoding of shr and lzcnt is more desirable.
	  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
	  // The shift is performed on i32, so the shift amount must not be built
	  // with the (possibly 64-bit) compare operand type. Use i8, the scalar
	  // shift amount type used for X86 shifts.
	  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
	                            DAG.getConstant(Log2b, dl, MVT::i8));
	  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
	}

	// Try to transform:
	// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
	// into:
	// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
	// Will also attempt to match more generic cases, eg:
	// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
	// Only applies if the target supports the FastLZCNT feature.
	//
	// N is the zero-extend node. Returns the rewritten value, or an empty
	// SDValue when the pattern does not match or the combine does not apply.
	static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize() \|\| !Subtarget.getTargetLowering()->isCtlzFast())
	return SDValue();

	// An OR is only a candidate when it has a single use.
	auto isORCandidate = [](SDValue N) {
	return (N->getOpcode() == ISD::OR && N->hasOneUse());
	};

	// Check the zero extend is extending to 32-bit or more. The code generated by
	// srl(ctlz) for 16-bit or less variants of the pattern would require extra
	// instructions to clear the upper bits.
	if (!N->hasOneUse() \|\| !N->getSimpleValueType(0).bitsGE(MVT::i32) \|\|
	!isORCandidate(N->getOperand(0)))
	return SDValue();

	// Check the node matches: setcc(eq, cmp 0)
	// NOTE(review): the final bitsGE() test inspects the value type of the
	// X86ISD::CMP node itself (operand 1 of the setcc), not the type of the
	// value being compared - confirm this is what was intended.
	auto isSetCCCandidate = [](SDValue N) {
	return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
	X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
	N->getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(N->getOperand(1).getOperand(1)) &&
	N->getOperand(1).getValueType().bitsGE(MVT::i32);
	};

	SDNode *OR = N->getOperand(0).getNode();
	SDValue LHS = OR->getOperand(0);
	SDValue RHS = OR->getOperand(1);

	// Save nodes matching or(or, setcc(eq, cmp 0)).
	// Walk down the chain of ORs, remembering every OR that has one OR operand
	// and one setcc operand; OR ends up pointing at the innermost OR.
	SmallVector<SDNode *, 2> ORNodes;
	while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
	ORNodes.push_back(OR);
	OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	}

	// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
	if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	!isORCandidate(SDValue(OR, 0)))
	return SDValue();

	// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
	// to
	// or(srl(ctlz),srl(ctlz)).
	// The dag combiner can then fold it into:
	// srl(or(ctlz, ctlz)).
	EVT VT = OR->getValueType(0);
	SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
	SDValue Ret, NewRHS;
	if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

	if (!Ret)
	return SDValue();

	// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
	// The saved ORs are rebuilt from the innermost outwards; Ret stands in for
	// the OR operand and the setcc operand is lowered again.
	while (ORNodes.size() > 0) {
	OR = ORNodes.pop_back_val();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
	// After the swap RHS is always the setcc operand.
	if (RHS->getOpcode() == ISD::OR)
	std::swap(LHS, RHS);
	EVT VT = OR->getValueType(0);
	SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
	if (!NewRHS)
	return SDValue();
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
	}

	// Re-apply the zero extension to the type of the original node.
	if (Ret)
	Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

	return Ret;
	}

	/// Target combine entry point for OR nodes in this file.
	///
	/// On SSE1-only subtargets a v4i32 OR is rewritten to X86ISD::FOR on v4f32.
	/// Everything else is skipped while DCI.isBeforeLegalizeOps() is true. After
	/// that the helper combines are tried in sequence, and finally scalar
	/// i16/i32/i64 ORs of opposing shifts are matched to X86ISD::SHLD / SHRD.
	///
	/// Returns the replacement value, or an empty SDValue when nothing matched.
	static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FOR to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(MVT::v4i32,
	DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N0),
	DAG.getBitcast(MVT::v4f32, N1)));
	}

	// The remaining combines are not attempted before operation legalization.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
	return R;

	// Everything below is the SHLD/SHRD matching, for scalar integers only.
	if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
	return SDValue();

	// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
	bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

	// SHLD/SHRD instructions have lower register pressure, but on some
	// platforms they have higher latency than the equivalent
	// series of shifts/or that would otherwise be generated.
	// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
	// have higher latencies and we are not optimizing for size.
	if (!OptForSize && Subtarget.isSHLDSlow())
	return SDValue();

	// Canonicalize so that N0 is the SHL and N1 is the SRL; both single-use.
	if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
	std::swap(N0, N1);
	if (N0.getOpcode() != ISD::SHL \|\| N1.getOpcode() != ISD::SRL)
	return SDValue();
	if (!N0.hasOneUse() \|\| !N1.hasOneUse())
	return SDValue();

	// Both shift amounts must be i8; look through a truncate on each of them.
	SDValue ShAmt0 = N0.getOperand(1);
	if (ShAmt0.getValueType() != MVT::i8)
	return SDValue();
	SDValue ShAmt1 = N1.getOperand(1);
	if (ShAmt1.getValueType() != MVT::i8)
	return SDValue();
	if (ShAmt0.getOpcode() == ISD::TRUNCATE)
	ShAmt0 = ShAmt0.getOperand(0);
	if (ShAmt1.getOpcode() == ISD::TRUNCATE)
	ShAmt1 = ShAmt1.getOperand(0);

	SDLoc DL(N);
	unsigned Opc = X86ISD::SHLD;
	SDValue Op0 = N0.getOperand(0);
	SDValue Op1 = N1.getOperand(0);
	// If the SHL amount is the derived one (SUB or XOR), this is the SHRD form:
	// swap so that ShAmt0 is the plain amount and ShAmt1 the derived one.
	if (ShAmt0.getOpcode() == ISD::SUB \|\|
	ShAmt0.getOpcode() == ISD::XOR) {
	Opc = X86ISD::SHRD;
	std::swap(Op0, Op1);
	std::swap(ShAmt0, ShAmt1);
	}

	// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
	// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
	// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
	// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
	unsigned Bits = VT.getSizeInBits();
	if (ShAmt1.getOpcode() == ISD::SUB) {
	// Variable amount: ShAmt1 must be (Bits - ShAmt0).
	SDValue Sum = ShAmt1.getOperand(0);
	if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
	SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
	if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op1 = ShAmt1Op1.getOperand(0);
	if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
	return DAG.getNode(Opc, DL, VT,
	Op0, Op1,
	DAG.getNode(ISD::TRUNCATE, DL,
	MVT::i8, ShAmt0));
	}
	} else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
	// Constant amounts: the two shift amounts must add up to Bits.
	ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
	if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
	return DAG.getNode(Opc, DL, VT,
	N0.getOperand(0), N1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL,
	MVT::i8, ShAmt0));
	} else if (ShAmt1.getOpcode() == ISD::XOR) {
	// XOR form: ShAmt1 must be (ShAmt0 ^ (Bits - 1)) and the other value must
	// already be shifted by one in the same direction.
	SDValue Mask = ShAmt1.getOperand(1);
	if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
	unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
	SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
	if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op0 = ShAmt1Op0.getOperand(0);
	if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
	if (Op1.getOpcode() == InnerShift &&
	isa<ConstantSDNode>(Op1.getOperand(1)) &&
	Op1.getConstantOperandVal(1) == 1) {
	return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
	}
	// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
	if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
	Op1.getOperand(0) == Op1.getOperand(1)) {
	return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
	}
	}
	}
	}

	return SDValue();
	}

	/// Rewrite a sign bit test written as
	///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
	/// into the comparison
	///   SETGT(X, -1)
	/// The fold is restricted to i8 / i1 results, single-use truncate and shift
	/// nodes, and shifts performed on i16, i32 or i64. Returns an empty SDValue
	/// when the pattern does not match.
	static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
	  // This is only worth doing if the output type is i8 or i1.
	  const EVT ResVT = N->getValueType(0);
	  if (!(ResVT == MVT::i8 || ResVT == MVT::i1))
	    return SDValue();

	  // The xor has to flip a single-use truncate by the constant one.
	  SDValue Trunc = N->getOperand(0);
	  const bool IsSingleUseTrunc =
	      Trunc.getOpcode() == ISD::TRUNCATE && Trunc.hasOneUse();
	  if (!IsSingleUseTrunc || !isOneConstant(N->getOperand(1)))
	    return SDValue();

	  // SetCC on x86 zero extends so only act on this if it's a logical shift.
	  SDValue Srl = Trunc.getOperand(0);
	  if (!(Srl.getOpcode() == ISD::SRL && Srl.hasOneUse()))
	    return SDValue();

	  // The shift must operate on one of i16, i32 or i64.
	  const EVT SrlVT = Srl.getValueType();
	  const bool IsWideScalar =
	      SrlVT == MVT::i16 || SrlVT == MVT::i32 || SrlVT == MVT::i64;
	  if (!IsWideScalar)
	    return SDValue();

	  // The shift amount must be the constant that isolates the sign bit.
	  auto *Amount = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
	  if (!Amount || Amount->getZExtValue() != SrlVT.getSizeInBits() - 1)
	    return SDValue();

	  // Create a greater-than comparison against -1.
	  // N.B. Using SETGE against 0 works but we want a canonical looking
	  // comparison, using SETGT matches up with what TranslateX86CC.
	  SDLoc DL(N);
	  SDValue Src = Srl.getOperand(0);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(),
	                                          *DAG.getContext(), ResVT);
	  SDValue MinusOne = DAG.getConstant(-1, DL, Src.getValueType());
	  SDValue Result = DAG.getSetCC(DL, CCVT, Src, MinusOne, ISD::SETGT);

	  // Widen the comparison result when setcc produces a different type.
	  if (CCVT == ResVT)
	    return Result;
	  return DAG.getNode(ISD::ZERO_EXTEND, DL, ResVT, Result);
	}

	/// Turn a vector sign bit test of the form
	///   xor (sra X, elt_size(X)-1), -1
	/// into
	///   pcmpgt X, -1
	///
	/// This should be called before type legalization because the pattern may not
	/// persist after that. Returns an empty SDValue when the type is not handled
	/// on this subtarget or the pattern does not match.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  const EVT VT = N->getValueType(0);
	  if (!VT.isSimple())
	    return SDValue();

	  // Decide per type whether the required subtarget feature is present.
	  bool IsTypeHandled = false;
	  switch (VT.getSimpleVT().SimpleTy) {
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	    IsTypeHandled = Subtarget.hasSSE2();
	    break;
	  case MVT::v2i64:
	    IsTypeHandled = Subtarget.hasSSE42();
	    break;
	  case MVT::v32i8:
	  case MVT::v16i16:
	  case MVT::v8i32:
	  case MVT::v4i64:
	    IsTypeHandled = Subtarget.hasAVX2();
	    break;
	  default:
	    break;
	  }
	  if (!IsTypeHandled)
	    return SDValue();

	  // There must be a single-use arithmetic shift right feeding the xor, and
	  // the xor must be a 'not' (its other operand is an all-ones build vector).
	  SDValue Sra = N->getOperand(0);
	  SDValue AllOnes = N->getOperand(1);
	  const bool IsNotOfSra = Sra.getOpcode() == ISD::SRA && Sra.hasOneUse() &&
	                          ISD::isBuildVectorAllOnes(AllOnes.getNode());
	  if (!IsNotOfSra)
	    return SDValue();

	  // The shift should be smearing the sign bit across each vector element:
	  // a constant splat amount equal to the element width minus one.
	  auto *AmountBV = dyn_cast<BuildVectorSDNode>(Sra.getOperand(1));
	  if (!AmountBV)
	    return SDValue();

	  const EVT EltVT = Sra.getValueType().getVectorElementType();
	  auto *SplatAmount = AmountBV->getConstantSplatNode();
	  if (!SplatAmount ||
	      SplatAmount->getZExtValue() != EltVT.getSizeInBits() - 1)
	    return SDValue();

	  // Create a greater-than comparison against -1. We don't use the more obvious
	  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
	  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Sra.getOperand(0),
	                     AllOnes);
	}

	/// Return true if a saturating truncation from type \p SrcVT to type
	/// \p DstVT is valid for the given \p Subtarget.
	static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
	                                        const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasAVX512())
	    return false;

	  // FIXME: Scalar type may be supported if we move it to vector register.
	  const bool IsUsableVector =
	      SrcVT.isVector() && SrcVT.isSimple() && SrcVT.getSizeInBits() <= 512;
	  if (!IsUsableVector)
	    return false;

	  // Source elements must be 16..64 bits wide, destination elements 8..32.
	  const unsigned SrcEltBits = SrcVT.getScalarType().getSizeInBits();
	  const unsigned DstEltBits = DstVT.getScalarType().getSizeInBits();
	  if (SrcEltBits < 16 || SrcEltBits > 64)
	    return false;
	  if (DstEltBits < 8 || DstEltBits > 32)
	    return false;

	  // Anything narrower than 512 bits requires VLX.
	  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
	    return false;

	  // Source elements narrower than 32 bits additionally require BWI.
	  return SrcEltBits >= 32 || Subtarget.hasBWI();
	}

	/// Match an unsigned saturating truncation:
	///   (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
	/// \p In is the value being truncated and \p VT the destination type.
	/// Returns the source value x to be truncated, or SDValue() when the pattern
	/// is not matched.
	static SDValue detectUSatPattern(SDValue In, EVT VT) {
	  if (In.getOpcode() != ISD::UMIN)
	    return SDValue();

	  // Saturation with truncation: In must be wider per element than VT.
	  assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
	         "Unexpected types for truncate operation");

	  // The clamp value has to be a constant splat ...
	  APInt SplatVal;
	  if (!ISD::isConstantSplatVector(In.getOperand(1).getNode(), SplatVal))
	    return SDValue();

	  // ... equal to UINT32_MAX / UINT16_MAX / UINT8_MAX, depending on the
	  // element size of the destination type.
	  if (!SplatVal.isMask(VT.getScalarSizeInBits()))
	    return SDValue();

	  return In.getOperand(0);
	}

	/// Match an unsigned saturating truncation:
	///   (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type)
	/// whose types also allow the use of a VPMOVUS* instruction on AVX512.
	/// Returns the source value to be truncated, or SDValue() when the pattern is
	/// not matched.
	static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
	                                       const X86Subtarget &Subtarget) {
	  const bool TypesAreValid =
	      isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget);
	  return TypesAreValid ? detectUSatPattern(In, VT) : SDValue();
	}

	/// Fold a truncate of \p In to \p VT into X86ISD::VTRUNCUS when both types
	/// are legal, \p In matches the unsigned saturation pattern, and the types
	/// are valid for saturating truncation on this AVX512 \p Subtarget.
	/// Returns SDValue() otherwise.
	static SDValue
	combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
	                        const X86Subtarget &Subtarget) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const EVT InVT = In.getValueType();
	  if (!(TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT)))
	    return SDValue();

	  SDValue Src = detectUSatPattern(In, VT);
	  if (!Src)
	    return SDValue();
	  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
	    return SDValue();

	  return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Src);
	}

	/// This function detects the AVG pattern between vectors of unsigned i8/i16,
	/// which is c = (a + b + 1) / 2, and replaces this operation with the
	/// efficient X86ISD::AVG instruction.
	///
	/// \p In is the value feeding the truncation (it has to be an ISD::SRL by one
	/// for the pattern to match) and \p VT is the truncated result type. Result
	/// vectors wider than 512 bits (with BWI), 256 bits (with AVX2) or 128 bits
	/// (otherwise) are split into sub-vectors that are averaged separately and
	/// concatenated again.
	///
	/// Returns the replacement value, or SDValue() if the pattern is not matched.
	static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	const SDLoc &DL) {
	if (!VT.isVector() \|\| !VT.isSimple())
	return SDValue();
	EVT InVT = In.getValueType();
	unsigned NumElems = VT.getVectorNumElements();

	// Only i8/i16 elements with a power-of-two element count are handled.
	EVT ScalarVT = VT.getVectorElementType();
	if (!((ScalarVT == MVT::i8 \|\| ScalarVT == MVT::i16) &&
	isPowerOf2_32(NumElems)))
	return SDValue();

	// InScalarVT is the intermediate type in AVG pattern and it should be greater
	// than the original input type (i8/i16).
	EVT InScalarVT = InVT.getVectorElementType();
	if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
	return SDValue();

	// The transform is not attempted without SSE2.
	if (!Subtarget.hasSSE2())
	return SDValue();

	// Detect the following pattern:
	//
	// %1 = zext <N x i8> %a to <N x i32>
	// %2 = zext <N x i8> %b to <N x i32>
	// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
	// %4 = add nuw nsw <N x i32> %3, %2
	// %5 = lshr <N x i32> %4, <i32 1 x N>
	// %6 = trunc <N x i32> %5 to <N x i8>
	//
	// In AVX512, the last instruction can also be a trunc store.
	if (In.getOpcode() != ISD::SRL)
	return SDValue();

	// A lambda checking the given SDValue is a constant vector and each element
	// is in the range [Min, Max]. Every operand has to be a ConstantSDNode, so a
	// build vector containing undef elements is rejected.
	auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
	BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
	if (!BV \|\| !BV->isConstant())
	return false;
	for (SDValue Op : V->ops()) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	if (!C)
	return false;
	uint64_t Val = C->getZExtValue();
	if (Val < Min \|\| Val > Max)
	return false;
	}
	return true;
	};

	// Split vectors to legal target size and apply AVG.
	auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
	// Number of pieces VT is cut into: the piece width is 512 bits with BWI,
	// 256 bits with AVX2 and 128 bits otherwise.
	unsigned NumSubs = 1;
	if (Subtarget.hasBWI()) {
	if (VT.getSizeInBits() > 512)
	NumSubs = VT.getSizeInBits() / 512;
	} else if (Subtarget.hasAVX2()) {
	if (VT.getSizeInBits() > 256)
	NumSubs = VT.getSizeInBits() / 256;
	} else {
	if (VT.getSizeInBits() > 128)
	NumSubs = VT.getSizeInBits() / 128;
	}

	// No split needed: emit a single AVG on the full type.
	if (NumSubs == 1)
	return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);

	// Average each pair of sub-vectors and glue the results back together.
	SmallVector<SDValue, 4> Subs;
	EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
	VT.getVectorNumElements() / NumSubs);
	for (unsigned i = 0; i != NumSubs; ++i) {
	unsigned Idx = i * SubVT.getVectorNumElements();
	SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
	SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
	Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
	};

	// Check if each element of the vector is logically right-shifted by one,
	// and that the shifted value is an addition.
	auto LHS = In.getOperand(0);
	auto RHS = In.getOperand(1);
	if (!IsConstVectorInRange(RHS, 1, 1))
	return SDValue();
	if (LHS.getOpcode() != ISD::ADD)
	return SDValue();

	// Detect a pattern of a + b + 1 where the order doesn't matter.
	SDValue Operands[3];
	Operands[0] = LHS.getOperand(0);
	Operands[1] = LHS.getOperand(1);

	// Take care of the case when one of the operands is a constant vector whose
	// element is in the range [1, 256] for i8 or [1, 65536] for i16, i.e. the
	// "+ 1" has already been merged into the constant operand.
	if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
	Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
	Operands[0].getOperand(0).getValueType() == VT) {
	// The pattern is detected. Subtract one from the constant vector, then
	// demote it and emit X86ISD::AVG instruction.
	SDValue VecOnes = DAG.getConstant(1, DL, InVT);
	Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
	Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
	return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
	}

	// Make Operands[1] the inner ADD (swapping if it was found in Operands[0]);
	// give up when neither operand is an ADD. Then split that inner ADD so that
	// Operands[0..2] hold the three addends.
	if (Operands[0].getOpcode() == ISD::ADD)
	std::swap(Operands[0], Operands[1]);
	else if (Operands[1].getOpcode() != ISD::ADD)
	return SDValue();
	Operands[2] = Operands[1].getOperand(0);
	Operands[1] = Operands[1].getOperand(1);

	// Now we have three operands of two additions. Check that one of them is a
	// constant vector with ones, and the other two are promoted from i8/i16.
	for (int i = 0; i < 3; ++i) {
	if (!IsConstVectorInRange(Operands[i], 1, 1))
	continue;
	// Move the vector of ones out of the way into slot 2.
	std::swap(Operands[i], Operands[2]);

	// Check if Operands[0] and Operands[1] are results of type promotion.
	for (int j = 0; j < 2; ++j)
	if (Operands[j].getOpcode() != ISD::ZERO_EXTEND \|\|
	Operands[j].getOperand(0).getValueType() != VT)
	return SDValue();

	// The pattern is detected, emit X86ISD::AVG instruction.
	return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
	}

	return SDValue();
	}

	/// Target-specific DAG combine for load nodes.
	///
	/// A non-extending 256-bit vector load is replaced by two 128-bit loads whose
	/// results are concatenated, once DAG combining is no longer before operation
	/// legalization, when either
	///  - the load is non-temporal, at least 16-byte aligned, and the subtarget
	///    lacks Int256 (AVX2), or
	///  - the target allows the access but reports it as not fast.
	/// Returns SDValue() when the load is left untouched.
	static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  LoadSDNode *Ld = cast<LoadSDNode>(N);
	  const EVT RegVT = Ld->getValueType(0);
	  const EVT MemVT = Ld->getMemoryVT();
	  SDLoc dl(Ld);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
	  // into two 16-byte operations. Also split non-temporal aligned loads on
	  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
	  const unsigned AddressSpace = Ld->getAddressSpace();
	  const unsigned Alignment = Ld->getAlignment();
	  if (!RegVT.is256BitVector() || DCI.isBeforeLegalizeOps() ||
	      Ld->getExtensionType() != ISD::NON_EXTLOAD)
	    return SDValue();

	  const bool SplitNonTemporal =
	      Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16;
	  if (!SplitNonTemporal) {
	    // Otherwise only split accesses that are legal but slow.
	    bool Fast;
	    const bool Allowed =
	        TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
	                               AddressSpace, Alignment, &Fast);
	    if (!Allowed || Fast)
	      return SDValue();
	  }

	  const unsigned NumElems = RegVT.getVectorNumElements();
	  if (NumElems < 2)
	    return SDValue();

	  const EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                                      NumElems / 2);

	  // Low half at the original address.
	  SDValue LoPtr = Ld->getBasePtr();
	  SDValue LoLoad =
	      DAG.getLoad(HalfVT, dl, Ld->getChain(), LoPtr, Ld->getPointerInfo(),
	                  Alignment, Ld->getMemOperand()->getFlags());

	  // High half 16 bytes further on; its alignment is capped accordingly.
	  SDValue HiPtr = DAG.getMemBasePlusOffset(LoPtr, 16, dl);
	  SDValue HiLoad =
	      DAG.getLoad(HalfVT, dl, Ld->getChain(), HiPtr,
	                  Ld->getPointerInfo().getWithOffset(16),
	                  MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());

	  // Merge both output chains and rebuild the full-width value.
	  SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                              LoLoad.getValue(1), HiLoad.getValue(1));
	  SDValue Joined =
	      DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, LoLoad, HiLoad);
	  return DCI.CombineTo(N, Joined, Chain, true);
	}

	/// If \p V is a build vector of boolean constants in which exactly one
	/// constant is true, return the operand index of that element. Undef
	/// elements are skipped. In every other case return -1.
	static int getOneTrueElt(SDValue V) {
	  // This needs to be a build vector of booleans.
	  // TODO: Checking for the i1 type matches the IR definition for the mask,
	  // but the mask check could be loosened to i8 or other types. That might
	  // also require checking more than 'allOnesValue'; eg, the x86 HW
	  // instructions only require that the MSB is set for each mask element.
	  // The ISD::MSTORE comments/definition do not specify how the mask operand
	  // is formatted.
	  auto *MaskBV = dyn_cast<BuildVectorSDNode>(V);
	  if (!MaskBV)
	    return -1;
	  const EVT MaskVT = MaskBV->getValueType(0);
	  if (MaskVT.getVectorElementType() != MVT::i1)
	    return -1;

	  int FoundIdx = -1;
	  for (unsigned Idx = 0, E = MaskVT.getVectorNumElements(); Idx < E; ++Idx) {
	    const SDValue &Elt = MaskBV->getOperand(Idx);
	    if (Elt.isUndef())
	      continue;

	    // Any non-constant element makes the mask unusable.
	    auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	    if (!EltC)
	      return -1;
	    if (!EltC->getAPIntValue().isAllOnesValue())
	      continue;

	    // A second true element is one too many.
	    if (FoundIdx >= 0)
	      return -1;
	    FoundIdx = Idx;
	  }
	  return FoundIdx;
	}

	/// For a masked memory load/store, return true if its mask has exactly one
	/// bit set. In that case the out-parameters are filled in as well:
	/// \p Addr with the memory address of the scalar element to load/store,
	/// \p Index with the vector index to insert/extract that element, and
	/// \p Alignment with the alignment for the scalar memory access.
	static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
	                                         SelectionDAG &DAG, SDValue &Addr,
	                                         SDValue &Index, unsigned &Alignment) {
	  const int EltIdx = getOneTrueElt(MaskedOp->getMask());
	  if (EltIdx < 0)
	    return false;

	  const EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();

	  // Address of the selected element: the base pointer, advanced by the
	  // element's byte offset unless it is the very first element.
	  Addr = MaskedOp->getBasePtr();
	  if (EltIdx != 0) {
	    const unsigned ByteOffset = EltIdx * EltVT.getStoreSize();
	    Addr = DAG.getMemBasePlusOffset(Addr, ByteOffset, SDLoc(MaskedOp));
	  }

	  Index = DAG.getIntPtrConstant(EltIdx, SDLoc(MaskedOp));
	  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
	  return true;
	}

	/// If exactly one element of the mask is set for a non-extending masked load,
	/// it is a scalar load and vector insert.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// Returns the result of DCI.CombineTo() replacing \p ML with the inserted
	/// vector and the scalar load's chain, or SDValue() when the mask does not
	/// have exactly one true element.
	static SDValue
	reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);
	  EVT EltVT = VT.getVectorElementType();

	  // Addr was advanced by (element index * element store size) bytes from the
	  // base pointer. The pointer info has to be advanced by the same amount,
	  // otherwise the scalar load's memory operand would describe the first
	  // element of the vector instead of the element that is actually read.
	  const uint64_t EltIdx = cast<ConstantSDNode>(VecIndex)->getZExtValue();
	  const uint64_t EltBytes =
	      ML->getMemoryVT().getVectorElementType().getStoreSize();
	  const int64_t ByteOffset = EltIdx * EltBytes;

	  // Load the one scalar element that is specified by the mask using the
	  // appropriate offset from the base pointer.
	  SDValue Load =
	      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
	                  ML->getPointerInfo().getWithOffset(ByteOffset), Alignment,
	                  ML->getMemOperand()->getFlags());

	  // Insert the loaded element into the appropriate place in the vector.
	  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
	                               Load, VecIndex);
	  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
	}

	/// Simplify a masked load whose mask is a build vector of constants.
	///
	/// When neither the first nor the last mask element is a null constant, the
	/// masked load becomes a plain vector load plus a select. Otherwise, unless
	/// the pass-through operand is already undef, it becomes a masked load with
	/// an undef pass-through plus a select against the original pass-through.
	/// Returns SDValue() when nothing is changed.
	static SDValue
	combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Mask = ML->getMask();
	  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
	    return SDValue();

	  SDLoc DL(ML);
	  const EVT VT = ML->getValueType(0);
	  SDValue PassThru = ML->getSrc0();

	  // If we are loading the first and last elements of a vector, it is safe and
	  // always faster to load the whole vector. Replace the masked load with a
	  // vector load and select.
	  auto *MaskBV = cast<BuildVectorSDNode>(Mask);
	  const unsigned LastIdx = VT.getVectorNumElements() - 1;
	  const bool FirstEltLoaded = !isNullConstant(MaskBV->getOperand(0));
	  const bool LastEltLoaded = !isNullConstant(MaskBV->getOperand(LastIdx));
	  if (FirstEltLoaded && LastEltLoaded) {
	    SDValue FullLoad = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
	                                   ML->getMemOperand());
	    SDValue Select = DAG.getSelect(DL, VT, Mask, FullLoad, PassThru);
	    return DCI.CombineTo(ML, Select, FullLoad.getValue(1), true);
	  }

	  // Convert a masked load with a constant mask into a masked load and a select.
	  // This allows the select operation to use a faster kind of select instruction
	  // (for example, vblendvps -> vblendps).

	  // Don't try this if the pass-through operand is already undefined. That would
	  // cause an infinite loop because that's what we're about to create.
	  if (PassThru.isUndef())
	    return SDValue();

	  // The new masked load has an undef pass-through operand. The select uses the
	  // original pass-through operand.
	  SDValue UndefPassThru = DAG.getUNDEF(VT);
	  SDValue MaskedLd =
	      DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), Mask,
	                        UndefPassThru, ML->getMemoryVT(), ML->getMemOperand(),
	                        ML->getExtensionType());
	  SDValue Select = DAG.getSelect(DL, VT, Mask, MaskedLd, PassThru);

	  return DCI.CombineTo(ML, Select, MaskedLd.getValue(1), true);
	}

	/// Target-specific DAG combine for masked load nodes.
	///
	/// Expanding loads are left alone. Non-extending loads are first tried as a
	/// scalar load + insert (single true mask element) and, without AVX512, as a
	/// constant-mask simplification. Sign-extending loads are rewritten into a
	/// non-extending masked load on a vector of the narrow memory element type
	/// followed by an X86ISD::VSEXT of the loaded value. Any other case returns
	/// SDValue().
	static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

	// TODO: Expanding load with constant mask may be optimized as well.
	if (Mld->isExpandingLoad())
	return SDValue();

	if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
	if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
	return ScalarLoad;
	// TODO: Do some AVX512 subsets benefit from this transform?
	if (!Subtarget.hasAVX512())
	if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
	return Blend;
	}

	// Only sign-extending loads are handled below.
	if (Mld->getExtensionType() != ISD::SEXTLOAD)
	return SDValue();

	// Resolve extending loads.
	EVT VT = Mld->getValueType(0);
	unsigned NumElems = VT.getVectorNumElements();
	EVT LdVT = Mld->getMemoryVT();
	SDLoc dl(Mld);

	assert(LdVT != VT && "Cannot extend to the same type");
	unsigned ToSz = VT.getScalarSizeInBits();
	unsigned FromSz = LdVT.getScalarSizeInBits();
	// From/To sizes and ElemCount must be pow of two.
	assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
	"Unexpected size for extending masked load");

	// Number of narrow (memory) elements that fit into one wide (result) element.
	unsigned SizeRatio = ToSz / FromSz;
	assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

	// Create a type on which we perform the shuffle: same total width as VT, but
	// made of memory-sized elements.
	EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	LdVT.getScalarType(), NumElems*SizeRatio);
	assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	// Convert Src0 value.
	SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
	if (!Mld->getSrc0().isUndef()) {
	// Gather every SizeRatio-th narrow element into the first NumElems lanes;
	// the remaining lanes stay undef (-1).
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;

	// Can't shuffle using an illegal type.
	assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
	"WideVecVT should be legal");
	WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
	DAG.getUNDEF(WideVecVT), ShuffleVec);
	}

	// Prepare the new mask.
	SDValue NewMask;
	SDValue Mask = Mld->getMask();
	if (Mask.getValueType() == VT) {
	// Mask and original value have the same type.
	NewMask = DAG.getBitcast(WideVecVT, Mask);
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	// Compact the mask the same way as the data ...
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;
	// ... and fill the remaining lanes from element 0 of the second shuffle
	// operand, which is the all-zeros vector.
	for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
	ShuffleVec[i] = NumElems * SizeRatio;
	NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
	DAG.getConstant(0, dl, WideVecVT),
	ShuffleVec);
	} else {
	// i1 mask: widen it by concatenating zero vectors after the original mask.
	assert(Mask.getValueType().getVectorElementType() == MVT::i1);
	unsigned WidenNumElts = NumElems*SizeRatio;
	unsigned MaskNumElts = VT.getVectorNumElements();
	EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	WidenNumElts);

	unsigned NumConcat = WidenNumElts / MaskNumElts;
	SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
	SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
	Ops[0] = Mask;
	NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
	}

	// Emit the non-extending masked load on the narrow-element vector, then
	// sign-extend the loaded elements to VT.
	SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
	Mld->getBasePtr(), NewMask, WideSrc0,
	Mld->getMemoryVT(), Mld->getMemOperand(),
	ISD::NON_EXTLOAD);
	SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
	return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
	}

	/// If exactly one element of the mask is set for a non-truncating masked store,
	/// it is a vector extract and scalar store.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// Returns the new scalar store, or SDValue() when the mask does not have
	/// exactly one true element.
	static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
	                                              SelectionDAG &DAG) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  // Extract the one scalar element that is actually being stored.
	  SDLoc DL(MS);
	  EVT VT = MS->getValue().getValueType();
	  EVT EltVT = VT.getVectorElementType();
	  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
	                                MS->getValue(), VecIndex);

	  // Addr was advanced by (element index * element store size) bytes from the
	  // base pointer. The pointer info has to be advanced by the same amount,
	  // otherwise the scalar store's memory operand would describe the first
	  // element of the vector instead of the element that is actually written.
	  const uint64_t EltIdx = cast<ConstantSDNode>(VecIndex)->getZExtValue();
	  const uint64_t EltBytes =
	      MS->getMemoryVT().getVectorElementType().getStoreSize();
	  const int64_t ByteOffset = EltIdx * EltBytes;

	  // Store that element at the appropriate offset from the base pointer.
	  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
	                      MS->getPointerInfo().getWithOffset(ByteOffset),
	                      Alignment, MS->getMemOperand()->getFlags());
	}

	/// Target-specific DAG combine for masked store nodes.
	///
	/// Compressing stores are left alone. Non-truncating stores are tried as an
	/// extract + scalar store (single true mask element) and then have a
	/// (pcmpgt 0, X) mask replaced by X itself. Truncating stores that are not
	/// legal are rewritten into a non-truncating masked store of a shuffled
	/// vector of memory-sized elements. Returns SDValue() when nothing changes.
	static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

	if (Mst->isCompressingStore())
	return SDValue();

	if (!Mst->isTruncatingStore()) {
	if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
	return ScalarStore;

	// If the mask is checking (0 > X), we're creating a vector with all-zeros
	// or all-ones elements based on the sign bits of X. AVX1 masked store only
	// cares about the sign bit of each mask element, so eliminate the compare:
	// mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
	// Note that by waiting to match an x86-specific PCMPGT node, we're
	// eliminating potentially more complex matching of a setcc node which has
	// a full range of predicates.
	SDValue Mask = Mst->getMask();
	if (Mask.getOpcode() == X86ISD::PCMPGT &&
	ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
	assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
	"Unexpected type for PCMPGT");
	return DAG.getMaskedStore(
	Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
	Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
	}

	// TODO: AVX512 targets should also be able to simplify something like the
	// pattern above, but that pattern will be different. It will either need to
	// match setcc more generally or match PCMPGTM later (in tablegen?).

	return SDValue();
	}

	// Resolve truncating stores.
	EVT VT = Mst->getValue().getValueType();
	unsigned NumElems = VT.getVectorNumElements();
	EVT StVT = Mst->getMemoryVT();
	SDLoc dl(Mst);

	assert(StVT != VT && "Cannot truncate to the same type");
	unsigned FromSz = VT.getScalarSizeInBits();
	unsigned ToSz = StVT.getScalarSizeInBits();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// The truncating store is legal in some cases. For example
	// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	// are designated for truncate store.
	// In this case we don't need any further transformations.
	if (TLI.isTruncStoreLegal(VT, StVT))
	return SDValue();

	// From/To sizes and ElemCount must be pow of two.
	assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
	"Unexpected size for truncating masked store");
	// We are going to use the original vector elt for storing.
	// Accumulated smaller vector elements must be a multiple of the store size.
	assert (((NumElems * FromSz) % ToSz) == 0 &&
	"Unexpected ratio for truncating masked store");

	// Number of narrow (memory) elements that fit into one wide (value) element.
	unsigned SizeRatio = FromSz / ToSz;
	assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

	// Create a type on which we perform the shuffle: same total width as VT, but
	// made of memory-sized elements.
	EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	StVT.getScalarType(), NumElems*SizeRatio);

	assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	// Gather every SizeRatio-th narrow element into the first NumElems lanes;
	// the remaining lanes stay undef (-1).
	SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;

	// Can't shuffle using an illegal type.
	assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
	"WideVecVT should be legal");

	SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	DAG.getUNDEF(WideVecVT),
	ShuffleVec);

	SDValue NewMask;
	SDValue Mask = Mst->getMask();
	if (Mask.getValueType() == VT) {
	// Mask and original value have the same type.
	NewMask = DAG.getBitcast(WideVecVT, Mask);
	// Compact the mask the same way as the data (ShuffleVec is reused) ...
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;
	// ... and fill the remaining lanes from element 0 of the second shuffle
	// operand, which is the all-zeros vector.
	for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
	ShuffleVec[i] = NumElems*SizeRatio;
	NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
	DAG.getConstant(0, dl, WideVecVT),
	ShuffleVec);
	} else {
	// i1 mask: widen it by concatenating zero vectors after the original mask.
	assert(Mask.getValueType().getVectorElementType() == MVT::i1);
	unsigned WidenNumElts = NumElems*SizeRatio;
	unsigned MaskNumElts = VT.getVectorNumElements();
	EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	WidenNumElts);

	unsigned NumConcat = WidenNumElts / MaskNumElts;
	SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
	SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
	Ops[0] = Mask;
	NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
	}

	// Emit the masked store of the packed value; the trailing 'false' makes it a
	// non-truncating store.
	return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
	Mst->getBasePtr(), NewMask, StVT,
	Mst->getMemOperand(), false);
	}

	/// Target-specific DAG combine for store nodes.
	///
	/// The following rewrites are tried in order:
	///  - a 256-bit vector store that the target allows but reports as not fast
	///    is split into two 128-bit stores;
	///  - a vector truncating store is replaced by a store of an X86ISD::AVG
	///    result, by a saturating truncating store (AVX512), or, when the
	///    truncating store is neither legal nor custom, by a shuffle that packs
	///    the truncated elements followed by one or more wide scalar stores;
	///  - a 64-bit load->store pair (MMX vector, or i64 on a 32-bit target with
	///    usable f64) is turned into i64/f64 or two i32 load/store pairs;
	///  - an i64 extracted from a vector and stored on a 32-bit target with
	///    usable f64 is stored as an f64 extract instead.
	///
	/// Returns the replacement chain value, or SDValue() when nothing applies.
	static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  StoreSDNode *St = cast<StoreSDNode>(N);
	  EVT VT = St->getValue().getValueType();
	  EVT StVT = St->getMemoryVT();
	  SDLoc dl(St);
	  SDValue StoredVal = St->getOperand(1);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // If we are saving a concatenation of two XMM registers and 32-byte stores
	  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
	  bool Fast;
	  unsigned AddressSpace = St->getAddressSpace();
	  unsigned Alignment = St->getAlignment();
	  if (VT.is256BitVector() && StVT == VT &&
	      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	                             AddressSpace, Alignment, &Fast) &&
	      !Fast) {
	    unsigned NumElems = VT.getVectorNumElements();
	    if (NumElems < 2)
	      return SDValue();

	    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
	    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

	    SDValue Ptr0 = St->getBasePtr();
	    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

	    SDValue Ch0 =
	        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
	                     Alignment, St->getMemOperand()->getFlags());
	    SDValue Ch1 =
	        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
	                     St->getPointerInfo().getWithOffset(16),
	                     MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
	  }

	  // Optimize trunc store (of multiple scalars) to shuffle and store.
	  // First, pack all of the elements in one place. Next, store to memory
	  // in fewer chunks.
	  if (St->isTruncatingStore() && VT.isVector()) {
	    // Check if we can detect an AVG pattern from the truncation. If yes,
	    // replace the trunc store by a normal store with the result of X86ISD::AVG
	    // instruction.
	    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
	                                       Subtarget, dl))
	      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
	                          St->getPointerInfo(), St->getAlignment(),
	                          St->getMemOperand()->getFlags());

	    if (SDValue Val =
	            detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
	      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
	                             dl, Val, St->getBasePtr(),
	                             St->getMemoryVT(), St->getMemOperand(), DAG);

	    unsigned NumElems = VT.getVectorNumElements();
	    assert(StVT != VT && "Cannot truncate to the same type");
	    unsigned FromSz = VT.getScalarSizeInBits();
	    unsigned ToSz = StVT.getScalarSizeInBits();

	    // The truncating store is legal in some cases. For example
	    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	    // are designated for truncate store.
	    // In this case we don't need any further transformations.
	    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
	      return SDValue();

	    // From, To sizes and ElemCount must be pow of two
	    if (!isPowerOf2_32(NumElems * FromSz * ToSz))
	      return SDValue();
	    // We are going to use the original vector elt for storing.
	    // Accumulated smaller vector elements must be a multiple of the store size.
	    if (0 != (NumElems * FromSz) % ToSz)
	      return SDValue();

	    unsigned SizeRatio = FromSz / ToSz;

	    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

	    // Create a type on which we perform the shuffle
	    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	                                     StVT.getScalarType(), NumElems*SizeRatio);

	    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	    // Gather every SizeRatio-th narrow element into the first NumElems lanes;
	    // the remaining lanes stay undef (-1).
	    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
	    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
	    for (unsigned i = 0; i != NumElems; ++i)
	      ShuffleVec[i] = i * SizeRatio;

	    // Can't shuffle using an illegal type.
	    if (!TLI.isTypeLegal(WideVecVT))
	      return SDValue();

	    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	                                         DAG.getUNDEF(WideVecVT),
	                                         ShuffleVec);
	    // At this point all of the data is stored at the bottom of the
	    // register. We now need to save it to mem.

	    // Find the largest store unit
	    MVT StoreType = MVT::i8;
	    for (MVT Tp : MVT::integer_valuetypes()) {
	      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
	        StoreType = Tp;
	    }

	    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
	    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
	        (64 <= NumElems * ToSz))
	      StoreType = MVT::f64;

	    // Bitcast the original vector into a vector of store-size units
	    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
	        StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
	    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
	    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
	    SmallVector<SDValue, 8> Chains;
	    SDValue Ptr = St->getBasePtr();

	    // Perform one or more big stores into memory. Each store after the first
	    // goes to an advanced address, so its pointer info carries the matching
	    // byte offset and its alignment is limited to what that offset guarantees
	    // (MinAlign with offset 0 leaves the original alignment unchanged).
	    const unsigned StoreBytes = StoreType.getStoreSize();
	    const unsigned NumStores = (ToSz * NumElems) / StoreType.getSizeInBits();
	    for (unsigned i = 0; i != NumStores; ++i) {
	      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
	                                   StoreType, ShuffWide,
	                                   DAG.getIntPtrConstant(i, dl));
	      const unsigned Offset = i * StoreBytes;
	      SDValue Ch =
	          DAG.getStore(St->getChain(), dl, SubVec, Ptr,
	                       St->getPointerInfo().getWithOffset(Offset),
	                       MinAlign(St->getAlignment(), Offset),
	                       St->getMemOperand()->getFlags());
	      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreBytes, dl);
	      Chains.push_back(Ch);
	    }

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	  }

	  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
	  // the FP state in cases where an emms may be missing.
	  // A preferable solution to the general problem is to figure out the right
	  // places to insert EMMS. This qualifies as a quick hack.

	  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
	  if (VT.getSizeInBits() != 64)
	    return SDValue();

	  const Function &F = DAG.getMachineFunction().getFunction();
	  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
	  bool F64IsLegal =
	      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
	  if ((VT.isVector() ||
	       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
	      isa<LoadSDNode>(St->getValue()) &&
	      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
	      St->getChain().hasOneUse() && !St->isVolatile()) {
	    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
	    SmallVector<SDValue, 8> Ops;

	    if (!ISD::isNormalLoad(Ld))
	      return SDValue();

	    // If this is not the MMX case, i.e. we are just turning i64 load/store
	    // into f64 load/store, avoid the transformation if there are multiple
	    // uses of the loaded value.
	    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
	      return SDValue();

	    SDLoc LdDL(Ld);
	    SDLoc StDL(N);
	    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
	    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
	    // pair instead.
	    if (Subtarget.is64Bit() || F64IsLegal) {
	      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
	      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
	                                  Ld->getMemOperand());

	      // Make sure new load is placed in same chain order.
	      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
	      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
	                          St->getMemOperand());
	    }

	    // Otherwise, lower to two pairs of 32-bit loads / stores.
	    SDValue LoAddr = Ld->getBasePtr();
	    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

	    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
	                               Ld->getPointerInfo(), Ld->getAlignment(),
	                               Ld->getMemOperand()->getFlags());
	    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
	                               Ld->getPointerInfo().getWithOffset(4),
	                               MinAlign(Ld->getAlignment(), 4),
	                               Ld->getMemOperand()->getFlags());
	    // Make sure new loads are placed in same chain order.
	    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
	    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

	    LoAddr = St->getBasePtr();
	    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

	    SDValue LoSt =
	        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
	                     St->getAlignment(), St->getMemOperand()->getFlags());
	    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
	                                St->getPointerInfo().getWithOffset(4),
	                                MinAlign(St->getAlignment(), 4),
	                                St->getMemOperand()->getFlags());
	    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
	  }

	  // This is similar to the above case, but here we handle a scalar 64-bit
	  // integer store that is extracted from a vector on a 32-bit target.
	  // If we have SSE2, then we can treat it like a floating-point double
	  // to get past legalization. The execution dependencies fixup pass will
	  // choose the optimal machine instruction for the store if this really is
	  // an integer or v2f32 rather than an f64.
	  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
	      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    SDValue OldExtract = St->getOperand(1);
	    SDValue ExtOp0 = OldExtract.getOperand(0);
	    unsigned VecSize = ExtOp0.getValueSizeInBits();
	    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
	    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
	    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	                                     BitCast, OldExtract.getOperand(1));
	    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
	                        St->getPointerInfo(), St->getAlignment(),
	                        St->getMemOperand()->getFlags());
	  }

	  return SDValue();
	}

	/// Return 'true' if this vector operation is "horizontal"
	/// and return the operands for the horizontal operation in LHS and RHS. A
	/// horizontal operation performs the binary operation on successive elements
	/// of its first operand, then on successive elements of its second operand,
	/// returning the resulting values in a vector. For example, if
	///   A = < float a0, float a1, float a2, float a3 >
	/// and
	///   B = < float b0, float b1, float b2, float b3 >
	/// then the result of doing a horizontal operation on A and B is
	///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
	/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
	/// A horizontal-op B, for some already available A and B, and if so then LHS is
	/// set to A, RHS to B, and the routine returns 'true'.
	/// Note that the binary operation should have the property that if one of the
	/// operands is UNDEF then the result is UNDEF.
	///
	/// \param LHS First operand of the binary op; overwritten only on success.
	/// \param RHS Second operand of the binary op; overwritten only on success.
	/// \param IsCommutative When true, the pair (Index + 1, Index) is accepted
	///        as well as (Index, Index + 1) for each result element.
	static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
	  // Look for the following pattern: if
	  //   A = < float a0, float a1, float a2, float a3 >
	  //   B = < float b0, float b1, float b2, float b3 >
	  // and
	  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
	  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
	  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
	  // which is A horizontal-op B.

	  // At least one of the operands should be a vector shuffle.
	  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
	      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  MVT VT = LHS.getSimpleValueType();

	  assert((VT.is128BitVector() || VT.is256BitVector()) &&
	         "Unsupported vector type for horizontal add/sub");

	  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
	  // operate independently on 128-bit lanes.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumLanes = VT.getSizeInBits() / 128;
	  unsigned NumLaneElts = NumElts / NumLanes;
	  assert((NumLaneElts % 2 == 0) &&
	         "Vector type should have an even number of elements in each lane");
	  unsigned HalfLaneElts = NumLaneElts / 2;

	  // View an operand in the form
	  //   Op = VECTOR_SHUFFLE In0, In1, Mask
	  // If Op is not a shuffle then pretend it is the shuffle
	  //   Op = VECTOR_SHUFFLE Op, undef, <0, 1, ..., N-1>
	  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
	  // type VT.
	  auto ViewAsShuffle = [NumElts](SDValue Op, SDValue &In0, SDValue &In1,
	                                 SmallVector<int, 16> &Mask) {
	    if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
	      if (!Op.getOperand(0).isUndef())
	        In0 = Op.getOperand(0);
	      if (!Op.getOperand(1).isUndef())
	        In1 = Op.getOperand(1);
	      ArrayRef<int> OpMask = cast<ShuffleVectorSDNode>(Op.getNode())->getMask();
	      std::copy(OpMask.begin(), OpMask.end(), Mask.begin());
	      return;
	    }
	    if (!Op.isUndef())
	      In0 = Op;
	    for (unsigned i = 0; i != NumElts; ++i)
	      Mask[i] = i;
	  };

	  // LHS = VECTOR_SHUFFLE A, B, LMask
	  SDValue A, B;
	  SmallVector<int, 16> LMask(NumElts);
	  ViewAsShuffle(LHS, A, B, LMask);

	  // RHS = VECTOR_SHUFFLE C, D, RMask
	  SDValue C, D;
	  SmallVector<int, 16> RMask(NumElts);
	  ViewAsShuffle(RHS, C, D, RMask);

	  // Check that the shuffles are both shuffling the same vectors.
	  if (!(A == C && B == D) && !(A == D && B == C))
	    return false;

	  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
	  if (!A.getNode() && !B.getNode())
	    return false;

	  // If A and B occur in reverse order in RHS, then "swap" them (which means
	  // rewriting the mask).
	  if (A != C)
	    ShuffleVectorSDNode::commuteMask(RMask);

	  // At this point LHS and RHS are equivalent to
	  //   LHS = VECTOR_SHUFFLE A, B, LMask
	  //   RHS = VECTOR_SHUFFLE A, B, RMask
	  // Check that the masks correspond to performing a horizontal operation.
	  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
	    for (unsigned i = 0; i != NumLaneElts; ++i) {
	      int LIdx = LMask[i + l], RIdx = RMask[i + l];

	      // Ignore any UNDEF components.
	      if (LIdx < 0 || RIdx < 0 ||
	          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
	          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
	        continue;

	      // Check that successive elements are being operated on. If not, this is
	      // not a horizontal operation.
	      // The first half of each result lane reads from A (Src == 0) and the
	      // second half from B (Src == 1); B's elements are numbered from NumElts.
	      unsigned Src = (i / HalfLaneElts); // each lane is split between srcs
	      int Index = 2 * (i % HalfLaneElts) + NumElts * Src + l;
	      if (!(LIdx == Index && RIdx == Index + 1) &&
	          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
	        return false;
	    }
	  }

	  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
	  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
	  return true;
	}

	/// Do target-specific dag combines on floating-point adds/subs.
	///
	/// If the operands form a horizontal pattern (see isHorizontalBinOp) and the
	/// subtarget/type combination is accepted below, the node is replaced by
	/// X86ISD::FHADD (for FADD) or X86ISD::FHSUB (for FSUB). Returns an empty
	/// SDValue when nothing was combined.
	static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  bool IsFadd = N->getOpcode() == ISD::FADD;
	  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

	  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
	  // Only FADD is treated as commutative by the pattern matcher.
	  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
	       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
	      isHorizontalBinOp(LHS, RHS, IsFadd)) {
	    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
	    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
	  }
	  return SDValue();
	}

	/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
	/// the codegen.
	/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
	///
	/// \param N  The ISD::TRUNCATE node (asserted).
	/// \param DL Debug location used for all newly created nodes.
	/// \returns the narrowed binary op, or an empty SDValue if no combine applies.
	static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget,
	                                          SDLoc &DL) {
	  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
	  SDValue Src = N->getOperand(0);
	  unsigned Opcode = Src.getOpcode();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
	    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

	    // Repeated operand, so we are only trading one output truncation for
	    // one input truncation.
	    if (Op0 == Op1)
	      return true;

	    // See if either operand has been extended from a smaller/equal size to
	    // the truncation size, allowing a truncation to combine with the extend.
	    auto IsExtendFromNarrow = [TruncSizeInBits](SDValue Op) {
	      unsigned ExtOpcode = Op.getOpcode();
	      return (ExtOpcode == ISD::ANY_EXTEND || ExtOpcode == ISD::SIGN_EXTEND ||
	              ExtOpcode == ISD::ZERO_EXTEND) &&
	             Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits;
	    };
	    if (IsExtendFromNarrow(Op0) || IsExtendFromNarrow(Op1))
	      return true;

	    // See if either operand is a single use constant which can be constant
	    // folded.
	    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
	    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
	    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
	           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
	  };

	  // Rebuild the source operation on truncated copies of both inputs.
	  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
	    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
	    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
	    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
	  };

	  // Don't combine if the operation has other uses.
	  if (!N->isOnlyUserOf(Src.getNode()))
	    return SDValue();

	  // Only support vector truncation for now.
	  // TODO: i64 scalar math would benefit as well.
	  if (!VT.isVector())
	    return SDValue();

	  // In most cases it's only worth pre-truncating if we're only facing the cost
	  // of one truncation.
	  // i.e. if one of the inputs will constant fold or the input is repeated.
	  switch (Opcode) {
	  case ISD::AND:
	  case ISD::XOR:
	  case ISD::OR: {
	    SDValue Op0 = Src.getOperand(0);
	    SDValue Op1 = Src.getOperand(1);
	    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
	        IsRepeatedOpOrFreeTruncation(Op0, Op1))
	      return TruncateArithmetic(Op0, Op1);
	    break;
	  }

	  case ISD::MUL:
	    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
	    // better to truncate if we have the chance.
	    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
	        !Subtarget.hasDQI())
	      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
	    LLVM_FALLTHROUGH;
	  case ISD::ADD: {
	    // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
	    SDValue Op0 = Src.getOperand(0);
	    SDValue Op1 = Src.getOperand(1);
	    if (TLI.isOperationLegal(Opcode, VT) &&
	        IsRepeatedOpOrFreeTruncation(Op0, Op1))
	      return TruncateArithmetic(Op0, Op1);
	    break;
	  }
	  }

	  return SDValue();
	}

	/// Truncate a group of v4i32 (or v2i64, as the assert below also permits)
	/// registers into v16i8/v8i16 using X86ISD::PACKUS.
	///
	/// \param N    The truncate node; supplies the result type and debug location.
	/// \param Regs The 128-bit input pieces. Modified in place: the elements are
	///             masked, bitcast and overwritten with the packed results.
	/// \returns the packed value, concatenated from several registers if needed.
	static SDValue
	combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
	                                  SmallVector<SDValue, 8> &Regs) {
	  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
	                             Regs[0].getValueType() == MVT::v2i64));
	  EVT OutVT = N->getValueType(0);
	  EVT OutSVT = OutVT.getVectorElementType();
	  EVT InVT = Regs[0].getValueType();
	  EVT InSVT = InVT.getVectorElementType();
	  SDLoc DL(N);

	  // First, use mask to unset all bits that won't appear in the result.
	  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
	         "OutSVT can only be either i8 or i16.");
	  APInt Mask =
	      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
	  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
	  for (auto &Reg : Regs)
	    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

	  // Pick the PACKUS input/output types from the final element width.
	  MVT UnpackedVT, PackedVT;
	  if (OutSVT == MVT::i8) {
	    UnpackedVT = MVT::v8i16;
	    PackedVT = MVT::v16i8;
	  } else {
	    UnpackedVT = MVT::v4i32;
	    PackedVT = MVT::v8i16;
	  }

	  // In each iteration, truncate the type by a half size.
	  auto RegNum = Regs.size();
	  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
	       j < e; j *= 2, RegNum /= 2) {
	    for (unsigned i = 0; i < RegNum; i++)
	      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
	    for (unsigned i = 0; i < RegNum / 2; i++)
	      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
	                            Regs[i * 2 + 1]);
	  }

	  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
	  // then extract a subvector as the result since v8i8 is not a legal type.
	  if (OutVT == MVT::v8i8) {
	    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
	    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
	                          DAG.getIntPtrConstant(0, DL));
	    return Regs[0];
	  }

	  // More than one register left: glue them together into the wide result.
	  if (RegNum > 1) {
	    Regs.resize(RegNum);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
	  }

	  return Regs[0];
	}

	/// Narrow a list of v4i32 registers to v8i16 by way of X86ISD::PACKSS.
	///
	/// Each input is first shifted left and then arithmetically right by 16, so
	/// that the upper half of every element is a copy of bit 15. Adjacent pairs of
	/// registers are then packed. \p Regs is updated in place.
	static SDValue
	combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG,
	                                  SmallVector<SDValue, 8> &Regs) {
	  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
	  SDLoc DL(N);
	  EVT ResultVT = N->getValueType(0);

	  // Sign-extend the low 16 bits of every element in place: shl 16, sra 16.
	  SDValue Sixteen = DAG.getConstant(16, DL, MVT::i32);
	  for (SDValue &Piece : Regs) {
	    SDValue Shifted = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Piece,
	                                          Sixteen, Subtarget, DAG);
	    Piece = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Shifted,
	                                Sixteen, Subtarget, DAG);
	  }

	  // Pack register pairs (2k, 2k+1) into slot k.
	  const unsigned NumPairs = Regs.size() / 2;
	  for (unsigned Pair = 0; Pair < NumPairs; ++Pair) {
	    SDValue Lo = Regs[Pair * 2];
	    SDValue Hi = Regs[Pair * 2 + 1];
	    Regs[Pair] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Lo, Hi);
	  }

	  // With at most two inputs the first slot already holds the answer.
	  if (Regs.size() <= 2)
	    return Regs[0];

	  Regs.resize(Regs.size() / 2);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResultVT, Regs);
	}

	/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
	/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
	/// legalization the truncation will be translated into a BUILD_VECTOR with each
	/// element that is extracted from a vector and then truncated, and it is
	/// difficult to do this optimization based on them.
	///
	/// \returns the replacement value, or an empty SDValue when the truncation is
	/// not one of the handled shapes or the subtarget is excluded below.
	static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  EVT OutVT = N->getValueType(0);
	  if (!OutVT.isVector())
	    return SDValue();

	  SDValue In = N->getOperand(0);
	  if (!In.getValueType().isSimple())
	    return SDValue();

	  EVT InVT = In.getValueType();
	  unsigned NumElems = OutVT.getVectorNumElements();

	  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
	  // SSE2, and we need to take care of it specially.
	  // AVX512 provides vpmovdb.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
	    return SDValue();

	  // Only i32/i64 -> i8/i16 with a power-of-two element count of at least 8.
	  EVT OutSVT = OutVT.getVectorElementType();
	  EVT InSVT = InVT.getVectorElementType();
	  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
	        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
	        NumElems >= 8))
	    return SDValue();

	  // SSSE3's pshufb results in less instructions in the cases below.
	  if (Subtarget.hasSSSE3() && NumElems == 8 &&
	      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
	       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
	    return SDValue();

	  SDLoc DL(N);

	  // Split a long vector into vectors of legal type.
	  unsigned RegNum = InVT.getSizeInBits() / 128;
	  SmallVector<SDValue, 8> SubVec(RegNum);
	  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
	  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

	  for (unsigned i = 0; i < RegNum; i++)
	    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
	                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

	  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
	  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
	  // truncate 2 x v4i32 to v8i16.
	  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
	    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
	  if (InSVT == MVT::i32)
	    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
	  return SDValue();
	}

	/// This function transforms vector truncation of 'extended sign-bits' or
	/// 'extended zero-bits' values.
	/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
	///
	/// \param N  The truncate node being combined.
	/// \param DL Debug location forwarded to truncateVectorWithPACK.
	/// \returns the PACKSS/PACKUS based replacement, or an empty SDValue.
	static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
	                                               SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  // Requires SSE2 but AVX512 has fast truncate.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
	    return SDValue();

	  SDValue In = N->getOperand(0);
	  if (!In.getValueType().isSimple())
	    return SDValue();

	  MVT VT = N->getValueType(0).getSimpleVT();
	  MVT SVT = VT.getScalarType();

	  MVT InVT = In.getValueType().getSimpleVT();
	  MVT InSVT = InVT.getScalarType();

	  // Check we have a truncation suited for PACKSS.
	  if (!VT.is128BitVector() && !VT.is256BitVector())
	    return SDValue();
	  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
	    return SDValue();
	  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
	    return SDValue();

	  // Use PACKSS if the input has sign-bits that extend all the way to the
	  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
	  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
	  unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
	  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
	    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

	  // Use PACKUS if the input has zero-bits that extend all the way to the
	  // packed/truncated value. e.g. masks, zext_in_reg, etc.
	  KnownBits Known;
	  DAG.computeKnownBits(In, Known);
	  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
	  // Without SSE4.1 only 8 packed bits are assumed for the PACKUS path.
	  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
	  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
	    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

	  return SDValue();
	}

	/// Do target-specific dag combines on truncate nodes.
	///
	/// The individual combines are attempted in a fixed order and the first one
	/// that produces a value wins; the generic vector truncation combine is the
	/// final fallback.
	static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);

	  // 1. Push the truncation through an arithmetic op when that is cheaper.
	  if (SDValue Narrowed = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
	    return Narrowed;

	  // 2. Look for an AVG pattern.
	  if (SDValue Average = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
	    return Average;

	  // 3. Fold the truncation together with an unsigned saturation.
	  if (SDValue Saturated = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
	    return Saturated;

	  // 4. An i32 truncate of a bitcast whose operand is an x86mmx value
	  //    becomes X86ISD::MMX_MOVD2W applied to that operand.
	  if (VT == MVT::i32 && Src.getOpcode() == ISD::BITCAST &&
	      Src.getOperand(0).getValueType() == MVT::x86mmx)
	    return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, Src.getOperand(0));

	  // 5. Truncate extended sign/zero bits with PACKSS/PACKUS.
	  if (SDValue Packed = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
	    return Packed;

	  // 6. Fall back to the generic vector truncation combine.
	  return combineVectorTruncation(N, DAG, Subtarget);
	}

	/// If node \p N flips the sign of an FP value, return the value being negated;
	/// otherwise return an empty SDValue.
	///
	/// A negation is either a plain FNEG(x) or an (F)XOR of x with a sign-mask
	/// constant such as 0x80000000. AVX512F does not have FXOR, so FNEG is
	/// lowered there as
	/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))),
	/// which is why all bitcasts are looked through here.
	static SDValue isFNEG(SDNode *N) {
	  if (N->getOpcode() == ISD::FNEG)
	    return N->getOperand(0);

	  SDValue Xor = peekThroughBitcasts(SDValue(N, 0));
	  const unsigned XorOpc = Xor.getOpcode();
	  if (XorOpc != X86ISD::FXOR && XorOpc != ISD::XOR)
	    return SDValue();

	  // The candidate mask operand must be of floating-point type.
	  SDValue Op1 = peekThroughBitcasts(Xor.getOperand(1));
	  if (!Op1.getValueType().isFloatingPoint())
	    return SDValue();

	  SDValue Op0 = peekThroughBitcasts(Xor.getOperand(0));

	  unsigned EltBits = Op1.getScalarValueSizeInBits();
	  auto isSignMask = [&](const ConstantFP *C) {
	    return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
	  };

	  // The same constant can be spelled several ways depending on the X86
	  // target and on the size of the value:
	  //  - a scalar load that is then broadcast,
	  //  - a BUILD_VECTOR node,
	  //  - a load from the constant pool.
	  // Exactly one of these forms is examined, chosen by the kind of Op1.
	  auto MaskIsSignBit = [&]() -> bool {
	    if (Op1.getOpcode() == X86ISD::VBROADCAST) {
	      auto *Scalar = getTargetConstantFromNode(Op1.getOperand(0));
	      return Scalar && isSignMask(cast<ConstantFP>(Scalar));
	    }

	    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
	      ConstantFPSDNode *Splat = BV->getConstantFPSplatNode();
	      return Splat && isSignMask(Splat->getConstantFPValue());
	    }

	    auto *Pooled = getTargetConstantFromNode(Op1);
	    if (!Pooled)
	      return false;

	    if (Pooled->getType()->isVectorTy()) {
	      auto *SplatV = Pooled->getSplatValue();
	      return SplatV && isSignMask(cast<ConstantFP>(SplatV));
	    }

	    auto *FPConst = dyn_cast<ConstantFP>(Pooled);
	    return FPConst && isSignMask(FPConst);
	  };

	  if (MaskIsSignBit())
	    return Op0;
	  return SDValue();
	}

	/// Do target-specific dag combines on floating point negations.
	///
	/// \p N must be recognised by isFNEG (asserted). The result, if any, is
	/// bitcast back to the original value type of \p N.
	static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  EVT OrigVT = N->getValueType(0);
	  SDValue Arg = isFNEG(N);
	  assert(Arg.getNode() && "N is expected to be an FNEG node");

	  EVT VT = Arg.getValueType();
	  EVT SVT = VT.getScalarType();
	  SDLoc DL(N);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // If we're negating a FMUL node on a target with FMA, then we can avoid the
	  // use of a constant by performing (-0 - A*B) instead.
	  // FIXME: Check rounding control flags as well once it becomes available.
	  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
	      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
	    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
	    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
	                                  Arg.getOperand(1), Zero);
	    return DAG.getBitcast(OrigVT, NewNode);
	  }

	  // If we're negating an FMA node, then we can adjust the
	  // instruction to include the extra negation.
	  // NewOpcode stays 0 when Arg has other uses or is not one of these forms.
	  unsigned NewOpcode = 0;
	  if (Arg.hasOneUse()) {
	    switch (Arg.getOpcode()) {
	    case ISD::FMA:           NewOpcode = X86ISD::FNMSUB;     break;
	    case X86ISD::FMSUB:      NewOpcode = X86ISD::FNMADD;     break;
	    case X86ISD::FNMADD:     NewOpcode = X86ISD::FMSUB;      break;
	    case X86ISD::FNMSUB:     NewOpcode = ISD::FMA;           break;
	    case X86ISD::FMADD_RND:  NewOpcode = X86ISD::FNMSUB_RND; break;
	    case X86ISD::FMSUB_RND:  NewOpcode = X86ISD::FNMADD_RND; break;
	    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND;  break;
	    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND;  break;
	    // We can't handle scalar intrinsic node here because it would only
	    // invert one element and not the whole vector. But we could try to handle
	    // a negation of the lower element only.
	    }
	  }
	  if (NewOpcode)
	    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
	                                              Arg.getNode()->ops()));

	  return SDValue();
	}

	/// Rewrite a vector X86ISD::FOR/FXOR/FAND/FANDN node as the corresponding
	/// integer logic op on a vXi64 bitcast of the operands, then bitcast the
	/// result back. Only done for vector types when SSE2 is available; otherwise
	/// an empty SDValue is returned.
	static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);

	  // Integer vector types are required for the integer opcodes.
	  if (!VT.isVector() || !Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc dl(N);
	  const unsigned NumI64Elts = VT.getSizeInBits() / 64;
	  MVT IntVT = MVT::getVectorVT(MVT::i64, NumI64Elts);

	  SDValue IntLHS = DAG.getBitcast(IntVT, N->getOperand(0));
	  SDValue IntRHS = DAG.getBitcast(IntVT, N->getOperand(1));

	  // Map the FP logic opcode onto its integer counterpart.
	  unsigned IntOpcode;
	  switch (N->getOpcode()) {
	  case X86ISD::FOR:
	    IntOpcode = ISD::OR;
	    break;
	  case X86ISD::FXOR:
	    IntOpcode = ISD::XOR;
	    break;
	  case X86ISD::FAND:
	    IntOpcode = ISD::AND;
	    break;
	  case X86ISD::FANDN:
	    IntOpcode = X86ISD::ANDNP;
	    break;
	  default:
	    llvm_unreachable("Unexpected FP logic op");
	  }

	  return DAG.getBitcast(VT, DAG.getNode(IntOpcode, dl, IntVT, IntLHS, IntRHS));
	}


	/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
	///
	/// Returns an empty SDValue unless \p N is an ISD::XOR whose second operand is
	/// the constant 1 and whose first operand is an X86ISD::SETCC node.
	static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() != ISD::XOR)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
	    return SDValue();

	  // Operand 0 of the SETCC is the condition code, operand 1 is reused as-is.
	  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
	      X86::CondCode(LHS->getConstantOperandVal(0)));
	  SDLoc DL(N);
	  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
	}

	/// Do target-specific dag combines on ISD::XOR nodes.
	///
	/// The SSE1-only FXOR conversion and the vector xor/shift-into-compare fold
	/// run unconditionally; the remaining folds are skipped while
	/// DCI.isBeforeLegalizeOps() is true.
	static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  // With SSE1 but no SSE2, express a v4i32 xor as FXOR on v4f32 to avoid
	  // scalarization.
	  const bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
	  if (IsSSE1Only && N->getValueType(0) == MVT::v4i32) {
	    return DAG.getBitcast(
	        MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
	                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
	                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
	  }

	  if (SDValue Folded = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
	    return Folded;

	  // Everything below is deferred until operation legalization has started.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  if (SDValue Inverted = foldXor1SetCC(N, DAG))
	    return Inverted;

	  if (SDValue Folded = foldXorTruncShiftIntoCmp(N, DAG))
	    return Folded;

	  if (SDValue AsFP = convertIntLogicToFPLogic(N, DAG, Subtarget))
	    return AsFP;

	  // Finally, treat the xor as an FP negation if it has that shape.
	  if (!isFNEG(N))
	    return SDValue();
	  return combineFneg(N, DAG, Subtarget);
	}


	/// Return true if \p V is a null FP constant (per isNullFPConstant) or a
	/// build vector of all zeros (per ISD::isBuildVectorAllZeros).
	static bool isNullFPScalarOrVectorConst(SDValue V) {
	  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
	}

	/// Given a value that is a scalar FP zero or a vector FP zero (possibly with
	/// undefined elements), produce a zero constant that can stand in for it.
	/// For vectors a fresh zero vector is built, so the result carries no
	/// undefined elements even when the input does; that makes it safe as a
	/// replacement operand for operations (eg, bitwise-and) where an undef should
	/// not propagate. Any other input yields an empty SDValue.
	static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  if (isNullFPScalarOrVectorConst(V)) {
	    // A scalar zero can be reused directly.
	    if (!V.getValueType().isVector())
	      return V;
	    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
	  }
	  return SDValue();
	}

	/// Fold an FAND where one operand is an FXOR with all-ones into
	/// X86ISD::FANDN. Only applies to f32 with SSE1, f64 with SSE2, and v4f32 on
	/// SSE1-only subtargets; otherwise returns an empty SDValue.
	static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
	  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
	        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
	        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
	    return SDValue();

	  // True if V is an all-ones build vector or an all-ones scalar FP constant.
	  auto isAllOnesConstantFP = [](SDValue V) {
	    if (V.getSimpleValueType().isVector())
	      return ISD::isBuildVectorAllOnes(V.getNode());
	    auto *C = dyn_cast<ConstantFPSDNode>(V);
	    return C && C->getConstantFPValue()->isAllOnesValue();
	  };

	  // fand (fxor X, -1), Y --> fandn X, Y
	  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

	  // fand X, (fxor Y, -1) --> fandn Y, X
	  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

	  return SDValue();
	}

	/// Target-specific dag combines for X86ISD::FAND nodes.
	static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  // FAND(0.0, x) -> 0.0 and FAND(x, 0.0) -> 0.0, checked in operand order.
	  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx)
	    if (SDValue Zero =
	            getNullFPConstForNullVal(N->getOperand(OpIdx), DAG, Subtarget))
	      return Zero;

	  // Try to form an FANDN before falling back to the integer lowering.
	  SDValue AndNot = combineFAndFNotToFAndn(N, DAG, Subtarget);
	  if (AndNot)
	    return AndNot;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Target-specific dag combines for X86ISD::FANDN nodes.
	static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // A zero first operand leaves the second unchanged: FANDN(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(Op0))
	    return Op1;

	  // A zero second operand gives zero: FANDN(x, 0.0) -> 0.0
	  SDValue Zero = getNullFPConstForNullVal(Op1, DAG, Subtarget);
	  if (Zero)
	    return Zero;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
	static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

	  // F[X]OR(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
	    return N->getOperand(1);

	  // F[X]OR(x, 0.0) -> x
	  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
	    return N->getOperand(0);

	  // If the node is really an FP negation, try the FNEG combines. When they
	  // produce nothing, fall through to the generic FP logic lowering.
	  if (isFNEG(N))
	    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
	      return NewVal;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
	///
	/// Under the UnsafeFPMath target option, the node is rebuilt with the
	/// commutative X86ISD::FMINC/FMAXC opcode; otherwise nothing is done.
	static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

	  // Only perform optimizations if UnsafeMath is used.
	  if (!DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
	  // into FMINC and FMAXC, which are Commutative operations.
	  unsigned NewOp = 0;
	  switch (N->getOpcode()) {
	  default: llvm_unreachable("unknown opcode");
	  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
	  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
	  }

	  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
	                     N->getOperand(0), N->getOperand(1));
	}

	/// Combine an FMINNUM/FMAXNUM node into an X86ISD::FMIN/FMAX node plus a
	/// select that handles a NaN first operand. ISD::FMAXNUM maps to
	/// X86ISD::FMAX; any other opcode reaching here maps to X86ISD::FMIN.
	/// Returns an empty SDValue for soft-float, for unsupported types, and for
	/// scalars when optimizing for minimum size.
	static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (Subtarget.useSoftFloat())
	    return SDValue();

	  // TODO: Check for global or instruction-level "nnan". In that case, we
	  //       should be able to lower to FMAX/FMIN alone.
	  // TODO: If an operand is already known to be a NaN or not a NaN, this
	  //       should be an optional swap and FMAX/FMIN.

	  EVT VT = N->getValueType(0);
	  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
	        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
	        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
	    return SDValue();

	  // This takes at least 3 instructions, so favor a library call when operating
	  // on a scalar and minimizing code size.
	  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
	    return SDValue();

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  SDLoc DL(N);
	  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
	      DAG.getDataLayout(), *DAG.getContext(), VT);

	  // There are 4 possibilities involving NaN inputs, and these are the required
	  // outputs:
	  //                   Op1
	  //               Num     NaN
	  //            ----------------
	  //       Num  |  Max  |  Op0 |
	  // Op0        ----------------
	  //       NaN  |  Op1  |  NaN |
	  //            ----------------
	  //
	  // The SSE FP max/min instructions were not designed for this case, but rather
	  // to implement:
	  //   Min = Op1 < Op0 ? Op1 : Op0
	  //   Max = Op1 > Op0 ? Op1 : Op0
	  //
	  // So they always return Op0 if either input is a NaN. However, we can still
	  // use those instructions for fmaxnum by selecting away a NaN input.

	  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
	  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
	  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
	  // SETUO of Op0 against itself is the NaN test for Op0.
	  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

	  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
	  // are NaN, the NaN value of Op1 is the result.
	  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
	}

	/// Do target-specific dag combines on X86ISD::ANDNP nodes.
	///
	/// Note that a successful shuffle combine replaces \p N through
	/// DCI.CombineTo and then returns an empty SDValue.
	static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  // ANDNP(0, x) -> x
	  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
	    return N->getOperand(1);

	  // ANDNP(x, 0) -> 0
	  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
	    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

	  EVT VT = N->getValueType(0);

	  // Attempt to recursively combine a bitmask ANDNP with shuffles.
	  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	    SDValue Op(N, 0);
	    if (SDValue Res = combineX86ShufflesRecursively(
	            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
	            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
	      DCI.CombineTo(N, Res);
	      return SDValue();
	    }
	  }

	  return SDValue();
	}

	/// Simplify the bit-index operand of an X86ISD::BT node.
	///
	/// Only the low Log2(width) bits of the index operand are demanded; when the
	/// DAG can provide a simpler value for those bits, a new BT node using it is
	/// returned. Otherwise an empty SDValue is returned.
	static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
	                         TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Src = N->getOperand(0);
	  SDValue BitIdx = N->getOperand(1);

	  // BT ignores high bits in the bit index operand.
	  const unsigned IdxWidth = BitIdx.getValueSizeInBits();
	  const APInt LowIdxBits = APInt::getLowBitsSet(IdxWidth, Log2_32(IdxWidth));

	  SDValue NarrowIdx = DAG.GetDemandedBits(BitIdx, LowIdxBits);
	  if (!NarrowIdx)
	    return SDValue();

	  return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, Src, NarrowIdx);
	}

	/// Narrow a v4i64 SIGN_EXTEND_INREG whose input is an any/sign-extended v4i32
	/// so that the in-register extension is performed at v4i32 instead.
	///
	/// Returns the replacement value, or an empty SDValue when N is left alone.
	static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
	  SDLoc dl(N);

	  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
	  // AVX2 since there is no sign-extended shift right operation on a vector
	  // with 64-bit elements.
	  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
	  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
	  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
	                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
	    SDValue N00 = N0.getOperand(0);

	    // EXTLOAD has a better solution on AVX2,
	    // it may be replaced with X86ISD::VSEXT node.
	    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
	      if (!ISD::isNormalLoad(N00.getNode()))
	        return SDValue();

	    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
	      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
	                                N00, N1);
	      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
	    }
	  }
	  return SDValue();
	}

	/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
	/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
	/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
	/// opportunities to combine math ops, use an LEA, or use a complex addressing
	/// mode. This can eliminate extend, add, and shift instructions.
	///
	/// Only extensions to i64 of an 'add' with a constant second operand are
	/// handled, and only when some user of the extend is an ADD or SHL. Returns
	/// the new wide 'add', or an empty SDValue when nothing was changed.
	static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
	      Ext->getOpcode() != ISD::ZERO_EXTEND)
	    return SDValue();

	  // TODO: This should be valid for other integer types.
	  EVT VT = Ext->getValueType(0);
	  if (VT != MVT::i64)
	    return SDValue();

	  SDValue Add = Ext->getOperand(0);
	  if (Add.getOpcode() != ISD::ADD)
	    return SDValue();

	  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
	  bool NSW = Add->getFlags().hasNoSignedWrap();
	  bool NUW = Add->getFlags().hasNoUnsignedWrap();

	  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
	  // into the 'zext'
	  if ((Sext && !NSW) || (!Sext && !NUW))
	    return SDValue();

	  // Having a constant operand to the 'add' ensures that we are not increasing
	  // the instruction count because the constant is extended for free below.
	  // A constant operand can also become the displacement field of an LEA.
	  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
	  if (!AddOp1)
	    return SDValue();

	  // Don't make the 'add' bigger if there's no hope of combining it with some
	  // other 'add' or 'shl' instruction.
	  // TODO: It may be profitable to generate simpler LEA instructions in place
	  // of single 'add' instructions, but the cost model for selecting an LEA
	  // currently has a high threshold.
	  bool HasLEAPotential = false;
	  for (auto *User : Ext->uses()) {
	    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
	      HasLEAPotential = true;
	      break;
	    }
	  }
	  if (!HasLEAPotential)
	    return SDValue();

	  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
	  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
	  SDValue AddOp0 = Add.getOperand(0);
	  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
	  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

	  // The wider add is guaranteed to not wrap because both operands are
	  // extended the same way (sign- or zero-extended, matching the no-wrap flag
	  // checked above), so the original flags are carried over.
	  SDNodeFlags Flags;
	  Flags.setNoSignedWrap(NSW);
	  Flags.setNoUnsignedWrap(NUW);
	  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
	}

	/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
	/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
	/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
	/// extends from AH (which we otherwise need to do contortions to access).
	///
	/// Applies only when N extends result #1 of an i8 divrem to i32 or i64 and
	/// the extension kind matches the signedness of the divrem. Uses of the
	/// original result #0 are redirected to the new node. Returns the extended
	/// value, or an empty SDValue if the pattern does not match.
	static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  auto OpcodeN = N->getOpcode();
	  auto OpcodeN0 = N0.getOpcode();
	  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
	        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  EVT InVT = N0.getValueType();
	  if (N0.getResNo() != 1 || InVT != MVT::i8 ||
	      !(VT == MVT::i32 || VT == MVT::i64))
	    return SDValue();

	  // The replacement node yields an i8 result #0 and an i32 result #1.
	  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
	  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
	                                               : X86ISD::UDIVREM8_ZEXT_HREG;
	  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
	                          N0.getOperand(1));
	  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
	  // If this was a 64-bit extend, complete it.
	  if (VT == MVT::i64)
	    return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
	  return R.getValue(1);
	}

	// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
	// operands and the result of CMOV is not used anywhere else - promote CMOV
	// itself instead of promoting its result. This could be beneficial, because:
	// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
	//    (or more) pseudo-CMOVs only when they go one-after-another and
	//    getting rid of result extension code after CMOV will help that.
	// 2) Promotion of constant CMOV arguments is free, hence the
	//    {ANY,SIGN,ZERO}_EXTEND will just be deleted.
	// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
	//    promotion is also good in terms of code-size.
	//    (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
	//    promotion).
	//
	// Returns the widened CMOV, or an empty SDValue if the operand of Extend is
	// not a single-use i16 CMOV of two constants being extended to i32/i64.
	static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
	  SDValue CMovN = Extend->getOperand(0);
	  if (CMovN.getOpcode() != X86ISD::CMOV)
	    return SDValue();

	  EVT TargetVT = Extend->getValueType(0);
	  unsigned ExtendOpcode = Extend->getOpcode();
	  SDLoc DL(Extend);

	  EVT VT = CMovN.getValueType();
	  SDValue CMovOp0 = CMovN.getOperand(0);
	  SDValue CMovOp1 = CMovN.getOperand(1);

	  bool DoPromoteCMOV =
	      (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
	      CMovN.hasOneUse() &&
	      (isa<ConstantSDNode>(CMovOp0.getNode()) &&
	       isa<ConstantSDNode>(CMovOp1.getNode()));

	  if (!DoPromoteCMOV)
	    return SDValue();

	  // Extend both constant inputs with the same opcode as the original extend.
	  CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
	  CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);

	  // Operands 2 and 3 of the original CMOV are passed through unchanged.
	  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
	                     CMovN.getOperand(2), CMovN.getOperand(3));
	}

	// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
	// This is more or less the reverse of combineBitcastvxi1.
	//
	// Only runs before operation legalization, on SSE2 targets without AVX512,
	// for results with i8/i16/i32/i64 elements. Returns the replacement vector,
	// or an empty SDValue when the pattern does not match.
	static SDValue
	combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
	                               TargetLowering::DAGCombinerInfo &DCI,
	                               const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
	      Opcode != ISD::ANY_EXTEND)
	    return SDValue();
	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SVT = VT.getScalarType();
	  EVT InSVT = N0.getValueType().getScalarType();
	  unsigned EltSizeInBits = SVT.getSizeInBits();

	  // Input type must be extending a bool vector (bit-casted from a scalar
	  // integer) to legal integer types.
	  if (!VT.isVector())
	    return SDValue();
	  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
	    return SDValue();
	  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  EVT SclVT = N00.getValueType();
	  if (!SclVT.isScalarInteger())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue Vec;
	  SmallVector<int, 32> ShuffleMask;
	  unsigned NumElts = VT.getVectorNumElements();
	  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

	  // Broadcast the scalar integer to the vector elements.
	  if (NumElts > EltSizeInBits) {
	    // If the scalar integer is greater than the vector element size, then we
	    // must split it down into sub-sections for broadcasting. For example:
	    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
	    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
	    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
	    unsigned Scale = NumElts / EltSizeInBits;
	    EVT BroadcastVT =
	        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
	    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
	    Vec = DAG.getBitcast(VT, Vec);

	    // Sub-section i is repeated across EltSizeInBits consecutive lanes.
	    for (unsigned i = 0; i != Scale; ++i)
	      ShuffleMask.append(EltSizeInBits, i);
	  } else {
	    // For smaller scalar integers, we can simply any-extend it to the vector
	    // element size (we don't care about the upper bits) and broadcast it to
	    // all elements.
	    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
	    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
	    ShuffleMask.append(NumElts, 0);
	  }
	  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

	  // Now, mask the relevant bit in each element.
	  SmallVector<SDValue, 32> Bits;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    int BitIdx = (i % EltSizeInBits);
	    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
	    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
	  }
	  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
	  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

	  // Compare against the bitmask and extend the result.
	  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
	  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
	  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

	  // For SEXT, this is now done, otherwise shift the result down for
	  // zero-extension.
	  if (Opcode == ISD::SIGN_EXTEND)
	    return Vec;
	  return DAG.getNode(ISD::SRL, DL, VT, Vec,
	                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
	}

	/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
	/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
	/// with UNDEFs) of the input to vectors of the same size as the target type
	/// which then extends the lowest elements.
	///
	/// Only runs before operation legalization on SSE2 targets, for i8/i16/i32
	/// inputs extended to i16/i32/i64 elements. Returns the replacement value, or
	/// an empty SDValue when no rewrite applies.
	static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
	                                          TargetLowering::DAGCombinerInfo &DCI,
	                                          const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
	    return SDValue();
	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SVT = VT.getScalarType();
	  EVT InVT = N0.getValueType();
	  EVT InSVT = InVT.getScalarType();

	  // Input type must be a vector and we must be extending legal integer types.
	  if (!VT.isVector())
	    return SDValue();
	  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
	    return SDValue();
	  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
	    return SDValue();

	  // On AVX2+ targets, if the input/output types are both legal then we will be
	  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
	  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
	    return SDValue();

	  SDLoc DL(N);

	  // Widen N to a vector of Size bits by concatenating it with UNDEF vectors
	  // of its own type; N occupies the lowest elements.
	  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
	    EVT InVT = N.getValueType();
	    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
	                                 Size / InVT.getScalarSizeInBits());
	    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
	                                  DAG.getUNDEF(InVT));
	    Opnds[0] = N;
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
	  };

	  // If target-size is less than 128-bits, extend to a type that would extend
	  // to 128 bits, extend that and extract the original target vector.
	  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
	    unsigned Scale = 128 / VT.getSizeInBits();
	    EVT ExVT =
	        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
	    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
	    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
	  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
	  // Also use this if we don't have SSE41 to allow the legalizer do its job.
	  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
	      (VT.is256BitVector() && Subtarget.hasInt256()) ||
	      (VT.is512BitVector() && Subtarget.hasAVX512())) {
	    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
	    return Opcode == ISD::SIGN_EXTEND
	               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
	               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
	  }

	  // Split the result into SplitSize-bit pieces, extend each piece in
	  // register from the matching slice of the input, and concatenate them.
	  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
	    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
	    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
	    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
	    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

	    SmallVector<SDValue, 8> Opnds;
	    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
	      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
	                                   DAG.getIntPtrConstant(Offset, DL));
	      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
	      SrcVec = Opcode == ISD::SIGN_EXTEND
	                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
	                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
	      Opnds.push_back(SrcVec);
	    }
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
	  };

	  // On pre-AVX2 targets, split into 128-bit nodes of
	  // ISD::*_EXTEND_VECTOR_INREG.
	  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
	    return SplitAndExtendInReg(128);

	  // On pre-AVX512 targets, split into 256-bit nodes of
	  // ISD::*_EXTEND_VECTOR_INREG.
	  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
	    return SplitAndExtendInReg(256);

	  return SDValue();
	}

	/// Target-specific combines for a sign-extend node.
	///
	/// Tries each specialised rewrite in a fixed order and returns the first one
	/// that produces a value; returns an empty SDValue if none applies. The
	/// divrem and CMOV rewrites are attempted at any stage, the remaining ones
	/// only before operation legalization.
	static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT DstVT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  SDLoc dl(N);

	  if (SDValue V = getDivRem8(N, DAG))
	    return V;

	  if (SDValue V = combineToExtendCMOV(N, DAG))
	    return V;

	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // sext (xor Bool, -1) --> sub (zext Bool), 1
	  // Inverting and sign-extending a boolean equals zero-extending it and
	  // subtracting 1: 0 becomes -1 and 1 becomes 0 (i.e. select Bool, 0, -1).
	  // The subtract is efficiently lowered with an LEA or a DEC.
	  const bool IsInvertedBool = SrcVT == MVT::i1 &&
	                              Src.getOpcode() == ISD::XOR &&
	                              isAllOnesConstant(Src.getOperand(1)) &&
	                              Src.hasOneUse();
	  if (IsInvertedBool) {
	    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src.getOperand(0));
	    SDValue One = DAG.getConstant(1, dl, DstVT);
	    return DAG.getNode(ISD::SUB, dl, DstVT, Wide, One);
	  }

	  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
	    return V;

	  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
	    return V;

	  if (DstVT.isVector()) {
	    if (SDValue V = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
	      return V;
	  }

	  // Last attempt; an empty result here means nothing matched.
	  if (SDValue V = promoteExtBeforeAdd(N, DAG, Subtarget))
	    return V;

	  return SDValue();
	}

	/// Fold FNEG operands of an FMA-family node into the opcode.
	///
	/// Negations found on the operands are stripped and the opcode is switched
	/// to the matching FMSUB/FNMADD/FNMSUB flavour (keeping the _RND, S1, S3 or
	/// 4S variant of the original node). Returns the new node, or an empty
	/// SDValue when the type is not a legal f32/f64 type, the subtarget has no
	/// FMA support, or nothing would change.
	static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  EVT ScalarVT = VT.getScalarType();
	  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
	    return SDValue();

	  SDValue A = N->getOperand(0);
	  SDValue B = N->getOperand(1);
	  SDValue C = N->getOperand(2);

	  // If V is a negation, replace it with the negated value and report true.
	  auto invertIfNegative = [](SDValue &V) {
	    if (SDValue NegVal = isFNEG(V.getNode())) {
	      V = NegVal;
	      return true;
	    }
	    return false;
	  };

	  // Do not convert the passthru input of scalar intrinsics.
	  // FIXME: We could allow negations of the lower element only.
	  bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
	              N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
	  bool NegB = invertIfNegative(B);
	  bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
	              N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

	  // Negative multiplication when NegA xor NegB
	  bool NegMul = (NegA != NegB);
	  bool HasNeg = NegA || NegB || NegC;

	  unsigned NewOpcode;
	  if (!NegMul)
	    NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
	  else
	    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

	  // For FMA, we risk reconstructing the node we started with.
	  // In order to avoid this, we check for negation or opcode change. If
	  // one of the two happened, then it is a new node and we return it.
	  if (N->getOpcode() == ISD::FMA) {
	    if (HasNeg || NewOpcode != N->getOpcode())
	      return DAG.getNode(NewOpcode, dl, VT, A, B, C);
	    return SDValue();
	  }

	  // Translate the generic opcode chosen above into the variant family of the
	  // node we started from.
	  if (N->getOpcode() == X86ISD::FMADD_RND) {
	    switch (NewOpcode) {
	    case ISD::FMA:       NewOpcode = X86ISD::FMADD_RND; break;
	    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
	    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
	    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
	    }
	  } else if (N->getOpcode() == X86ISD::FMADDS1) {
	    switch (NewOpcode) {
	    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1; break;
	    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1; break;
	    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
	    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
	    }
	  } else if (N->getOpcode() == X86ISD::FMADDS3) {
	    switch (NewOpcode) {
	    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3; break;
	    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3; break;
	    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
	    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
	    }
	  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
	    switch (NewOpcode) {
	    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1_RND; break;
	    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
	    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
	    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
	    }
	  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
	    switch (NewOpcode) {
	    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3_RND; break;
	    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
	    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
	    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
	    }
	  } else if (N->getOpcode() == X86ISD::FMADD4S) {
	    switch (NewOpcode) {
	    case ISD::FMA:       NewOpcode = X86ISD::FMADD4S; break;
	    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB4S; break;
	    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
	    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
	    }
	  } else {
	    llvm_unreachable("Unexpected opcode!");
	  }

	  // Only return the node if the opcode was changed or one of the
	  // operands was negated. If not, we'll just recreate the same node.
	  if (HasNeg || NewOpcode != N->getOpcode()) {
	    // A fourth operand on the original node is forwarded unchanged.
	    if (N->getNumOperands() == 4)
	      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
	    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
	  }

	  return SDValue();
	}

	// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
	//
	// The same swap is applied in the other direction and for the _RND variants.
	// Returns the new node, or an empty SDValue if operand 2 is not a negation.
	static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // Nothing to do unless the third operand is a negated value.
	  SDValue Addend = isFNEG(N->getOperand(2).getNode());
	  if (!Addend)
	    return SDValue();

	  // Pick the opcode with the add/sub roles exchanged.
	  unsigned SwappedOpc;
	  switch (N->getOpcode()) {
	  case X86ISD::FMADDSUB:
	    SwappedOpc = X86ISD::FMSUBADD;
	    break;
	  case X86ISD::FMADDSUB_RND:
	    SwappedOpc = X86ISD::FMSUBADD_RND;
	    break;
	  case X86ISD::FMSUBADD:
	    SwappedOpc = X86ISD::FMADDSUB;
	    break;
	  case X86ISD::FMSUBADD_RND:
	    SwappedOpc = X86ISD::FMADDSUB_RND;
	    break;
	  default:
	    llvm_unreachable("Unexpected opcode!");
	  }

	  SDLoc DL(N);
	  EVT ResVT = N->getValueType(0);
	  SDValue MulLHS = N->getOperand(0);
	  SDValue MulRHS = N->getOperand(1);

	  // Three-operand nodes are rebuilt as-is; a fourth operand, when present,
	  // is carried over untouched.
	  if (N->getNumOperands() != 4)
	    return DAG.getNode(SwappedOpc, DL, ResVT, MulLHS, MulRHS, Addend);

	  return DAG.getNode(SwappedOpc, DL, ResVT, MulLHS, MulRHS, Addend,
	                     N->getOperand(3));
	}

	/// Target-specific combines for a zero-extend node.
	///
	/// First removes the extend around a masked or truncated SETCC_CARRY, then
	/// tries the shared extend rewrites in a fixed order. Returns the first
	/// replacement found, or an empty SDValue.
	static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Src = N->getOperand(0);
	  EVT DstVT = N->getValueType(0);

	  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
	  //   (and (i32 x86isd::setcc_carry), 1)
	  // and likewise when the setcc_carry is wrapped in a truncate instead of an
	  // 'and'. This eliminates the zext. This transformation is necessary
	  // because ISD::SETCC is always legalized to i8.
	  const unsigned SrcOpc = Src.getOpcode();
	  const bool IsAndOrTrunc = SrcOpc == ISD::AND || SrcOpc == ISD::TRUNCATE;
	  if (IsAndOrTrunc && Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
	    SDValue Carry = Src.getOperand(0);
	    if (Carry.getOpcode() == X86ISD::SETCC_CARRY) {
	      // For the 'and' form the mask has to be exactly 1; otherwise give up
	      // on this node entirely.
	      if (SrcOpc == ISD::AND && !isOneConstant(Src.getOperand(1)))
	        return SDValue();

	      SDValue WideCarry =
	          DAG.getNode(X86ISD::SETCC_CARRY, DL, DstVT, Carry.getOperand(0),
	                      Carry.getOperand(1));
	      SDValue One = DAG.getConstant(1, DL, DstVT);
	      return DAG.getNode(ISD::AND, DL, DstVT, WideCarry, One);
	    }
	  }

	  if (SDValue V = combineToExtendCMOV(N, DAG))
	    return V;

	  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
	    return V;

	  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
	    return V;

	  if (DstVT.isVector()) {
	    if (SDValue V = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
	      return V;
	  }

	  if (SDValue V = getDivRem8(N, DAG))
	    return V;

	  if (SDValue V = promoteExtBeforeAdd(N, DAG, Subtarget))
	    return V;

	  if (SDValue V = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
	    return V;

	  return SDValue();
	}

	/// Try to map a 128-bit or larger integer comparison to vector instructions
	/// before type legalization splits it up into chunks.
	///
	/// Handles eq/ne comparisons of 128-bit scalars with SSE2 and of 256-bit
	/// scalars with AVX2. Returns the replacement setcc, or an empty SDValue.
	static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
	  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

	  // We're looking for an oversized integer equality comparison.
	  SDValue X = SetCC->getOperand(0);
	  SDValue Y = SetCC->getOperand(1);
	  EVT OpVT = X.getValueType();
	  unsigned OpSize = OpVT.getSizeInBits();
	  if (!OpVT.isScalarInteger() || OpSize < 128)
	    return SDValue();

	  // Ignore a comparison with zero because that gets special treatment in
	  // EmitTest(). But make an exception for the special case of a pair of
	  // logically-combined vector-sized operands compared to zero. This pattern
	  // may be generated by the memcmp expansion pass with oversized integer
	  // compares (see PR33325).
	  bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
	                          X.getOperand(0).getOpcode() == ISD::XOR &&
	                          X.getOperand(1).getOpcode() == ISD::XOR;
	  if (isNullConstant(Y) && !IsOrXorXorCCZero)
	    return SDValue();

	  // Bail out if we know that this is not really just an oversized integer.
	  if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
	      peekThroughBitcasts(Y).getValueType() == MVT::f128)
	    return SDValue();

	  // TODO: Use PXOR + PTEST for SSE4.1 or later?
	  // TODO: Add support for AVX-512.
	  EVT VT = SetCC->getValueType(0);
	  SDLoc DL(SetCC);
	  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
	      (OpSize == 256 && Subtarget.hasAVX2())) {
	    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
	    SDValue Cmp;
	    if (IsOrXorXorCCZero) {
	      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
	      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
	      // Use 2 vector equality compares and 'and' the results before doing a
	      // MOVMSK.
	      SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
	      SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
	      SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
	      SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
	      SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
	      SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
	      Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
	    } else {
	      SDValue VecX = DAG.getBitcast(VecVT, X);
	      SDValue VecY = DAG.getBitcast(VecVT, Y);
	      Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
	    }
	    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
	    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
	    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
	    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
	    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
	    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
	    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
	                                    MVT::i32);
	    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
	  }

	  return SDValue();
	}

	/// Target-specific combines for ISD::SETCC.
	///
	/// Handles equality against a negated operand, oversized integer equality,
	/// comparisons of a sign-extended i1 vector against zero, and early lowering
	/// of v4f32 comparisons on SSE1-only targets. Returns the replacement value,
	/// or an empty SDValue when nothing applies.
	static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
	    EVT OpVT = LHS.getValueType();
	    // 0-x == y --> x+y == 0
	    // 0-x != y --> x+y != 0
	    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
	        LHS.hasOneUse()) {
	      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
	      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
	    }
	    // x == 0-y --> x+y == 0
	    // x != 0-y --> x+y != 0
	    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
	        RHS.hasOneUse()) {
	      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
	      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
	    }

	    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
	      return V;
	  }

	  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
	      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
	    // Put build_vectors on the right.
	    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
	      std::swap(LHS, RHS);
	      CC = ISD::getSetCCSwappedOperands(CC);
	    }

	    bool IsSEXT0 =
	        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
	        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
	    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

	    // (sext vXi1 M) compared against zero can be answered from M alone.
	    if (IsSEXT0 && IsVZero1) {
	      assert(VT == LHS.getOperand(0).getValueType() &&
	             "Uexpected operand type");
	      if (CC == ISD::SETGT)
	        return DAG.getConstant(0, DL, VT);
	      if (CC == ISD::SETLE)
	        return DAG.getConstant(1, DL, VT);
	      if (CC == ISD::SETEQ || CC == ISD::SETGE)
	        return DAG.getNOT(DL, LHS.getOperand(0), VT);

	      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
	             "Unexpected condition code!");
	      return LHS.getOperand(0);
	    }
	  }

	  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
	  // to avoid scalarization via legalization because v4i32 is not a legal type.
	  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
	      LHS.getValueType() == MVT::v4f32)
	    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

	  return SDValue();
	}

	/// Simplify the source of an X86ISD::MOVMSK node, of which only the sign bit
	/// of each element is demanded.
	///
	/// Returns SDValue(N, 0) when the source was simplified, otherwise an empty
	/// SDValue.
	static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Vec = N->getOperand(0);

	  // MOVMSK only uses the MSB from each vector element.
	  const unsigned EltBits = Vec.getSimpleValueType().getScalarSizeInBits();
	  const APInt SignBitOnly = APInt::getSignMask(EltBits);

	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());
	  KnownBits Known;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.SimplifyDemandedBits(Vec, SignBitOnly, Known, TLO))
	    return SDValue();

	  // Queue the old source for another visit and commit the simplification.
	  DCI.AddToWorklist(Vec.getNode());
	  DCI.CommitTargetLoweringOpt(TLO);
	  return SDValue(N, 0);
	}

	/// Simplify the index (operand 4) and mask (operand 2) of a gather/scatter
	/// node in place.
	///
	/// Whenever an operand is rewritten the node is updated in place and
	/// SDValue(N, 0) is returned; otherwise an empty SDValue is returned.
	static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    const X86Subtarget &Subtarget) {
	  SDLoc dl(N);

	  // Rebuild N's operand list with slot OpNo replaced by NewOp.
	  auto ReplaceOperand = [&](unsigned OpNo, SDValue NewOp) {
	    SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
	    Ops[OpNo] = NewOp;
	    DAG.UpdateNodeOperands(N, Ops);
	  };

	  if (DCI.isBeforeLegalizeOps()) {
	    SDValue Idx = N->getOperand(4);

	    // Remove any sign extends from 32 or smaller to larger than 32.
	    // Only do this before LegalizeOps in case we need the sign extend for
	    // legalization.
	    if (Idx.getOpcode() == ISD::SIGN_EXTEND &&
	        Idx.getScalarValueSizeInBits() > 32 &&
	        Idx.getOperand(0).getScalarValueSizeInBits() <= 32) {
	      ReplaceOperand(4, Idx.getOperand(0));
	      // The original sign extend has fewer users now; revisit it in case it
	      // can be removed.
	      DCI.AddToWorklist(Idx.getNode());
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }

	    // Make sure the index is either i32 or i64
	    const unsigned IdxBits = Idx.getScalarValueSizeInBits();
	    if (IdxBits != 32 && IdxBits != 64) {
	      MVT NewEltVT = IdxBits > 32 ? MVT::i64 : MVT::i32;
	      EVT NewIdxVT =
	          EVT::getVectorVT(*DAG.getContext(), NewEltVT,
	                           Idx.getValueType().getVectorNumElements());
	      ReplaceOperand(4, DAG.getSExtOrTrunc(Idx, dl, NewIdxVT));
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }

	    // Try to remove zero extends from 32->64 if we know the sign bit of
	    // the input is zero.
	    if (Idx.getOpcode() == ISD::ZERO_EXTEND &&
	        Idx.getScalarValueSizeInBits() == 64 &&
	        Idx.getOperand(0).getScalarValueSizeInBits() == 32 &&
	        DAG.SignBitIsZero(Idx.getOperand(0))) {
	      ReplaceOperand(4, Idx.getOperand(0));
	      // The original zero extend has fewer users now; revisit it in case it
	      // can be removed.
	      DCI.AddToWorklist(Idx.getNode());
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }
	  }

	  // Gather and Scatter instructions use k-registers for masks. The type of
	  // the masks is v*i1. So the mask will be truncated anyway.
	  // The SIGN_EXTEND_INREG may be dropped.
	  SDValue MaskOp = N->getOperand(2);
	  const bool HasAVX512 = Subtarget.hasAVX512();
	  if (HasAVX512 && MaskOp.getOpcode() == ISD::SIGN_EXTEND_INREG) {
	    ReplaceOperand(2, MaskOp.getOperand(0));
	    return SDValue(N, 0);
	  }

	  // With AVX2 we only demand the upper bit of the mask.
	  if (HasAVX512)
	    return SDValue();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());
	  KnownBits Known;
	  const APInt SignBitOnly =
	      APInt::getSignMask(MaskOp.getScalarValueSizeInBits());
	  if (!TLI.SimplifyDemandedBits(MaskOp, SignBitOnly, Known, TLO))
	    return SDValue();

	  DCI.AddToWorklist(MaskOp.getNode());
	  DCI.CommitTargetLoweringOpt(TLO);
	  return SDValue(N, 0);
	}

	// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
	//
	// Operand 0 holds the condition code and operand 1 the flags value. Returns
	// a new SETCC built from the simplified pair, or an empty SDValue when the
	// helper finds nothing to simplify.
	static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  SDValue FlagsIn = N->getOperand(1);
	  X86::CondCode Cond = X86::CondCode(N->getConstantOperandVal(0));

	  // Try to simplify the EFLAGS and condition code operands.
	  SDValue FlagsOut = combineSetCCEFLAGS(FlagsIn, Cond, DAG, Subtarget);
	  if (!FlagsOut)
	    return SDValue();

	  return getSETCC(Cond, FlagsOut, dl, DAG);
	}

	/// Optimize branch condition evaluation.
	///
	/// Operand 2 holds the condition code and operand 3 the flags value. Returns
	/// a new X86ISD::BRCOND built from the simplified pair, or an empty SDValue
	/// when the helper finds nothing to simplify.
	static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
	                             const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  SDValue FlagsIn = N->getOperand(3);
	  X86::CondCode BrCC = X86::CondCode(N->getConstantOperandVal(2));

	  // Try to simplify the EFLAGS and condition code operands.
	  SDValue FlagsOut = combineSetCCEFLAGS(FlagsIn, BrCC, DAG, Subtarget);
	  if (!FlagsOut)
	    return SDValue();

	  // Operands 0 and 1 are deliberately fetched only now: combineSetCCEFLAGS
	  // can RAUW operands under us, so no references are kept across the call.
	  SDValue CondOp = DAG.getConstant(BrCC, dl, MVT::i8);
	  return DAG.getNode(X86ISD::BRCOND, dl, N->getVTList(), N->getOperand(0),
	                     N->getOperand(1), CondOp, FlagsOut);
	}

	/// Fold a unary operation applied to a masked vector compare:
	///   UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
	///     AND(VECTOR_CMP(x,y), constant2),  constant2 = UNARYOP(constant)
	/// This relies on vector comparisons producing 0 or -1 in each lane.
	/// Returns an empty SDValue when the pattern does not match.
	static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
	                                                  SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);

	  // Only vector operations are candidates.
	  if (!VT.isVector())
	    return SDValue();

	  // The operand of the unary op has to be a bitwise AND ...
	  SDValue Masked = N->getOperand(0);
	  if (Masked->getOpcode() != ISD::AND)
	    return SDValue();

	  // ... whose first operand is a SETCC ...
	  if (Masked->getOperand(0)->getOpcode() != ISD::SETCC)
	    return SDValue();

	  // ... and which has the same total size as the result.
	  if (VT.getSizeInBits() != Masked.getValueSizeInBits())
	    return SDValue();

	  // The other AND operand must be a constant build vector. Non-constant
	  // splats could be transformed too, but it's unclear that would be a
	  // benefit: no operation would be eliminated, only one more step would be
	  // performed in scalar code before moving to the vector unit.
	  BuildVectorSDNode *ConstVec =
	      dyn_cast<BuildVectorSDNode>(Masked->getOperand(1));
	  if (!ConstVec)
	    return SDValue();
	  if (!ConstVec->isConstant())
	    return SDValue();

	  // Everything checks out. Apply the unary op to the constant itself, then
	  // redo the AND in the integer domain with bitcasts around it.
	  SDLoc DL(N);
	  EVT IntVT = ConstVec->getValueType(0);
	  SDValue FoldedConst =
	      DAG.getNode(N->getOpcode(), DL, VT, SDValue(ConstVec, 0));
	  SDValue IntMask = DAG.getBitcast(IntVT, FoldedConst);
	  SDValue MaskedCmp =
	      DAG.getNode(ISD::AND, DL, IntVT, Masked->getOperand(0), IntMask);
	  return DAG.getBitcast(VT, MaskedCmp);
	}

	/// Combine an unsigned int-to-FP conversion node into SINT_TO_FP where that
	/// is known to give the same result. Returns an empty SDValue otherwise.
	static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT ResVT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcEltVT = SrcVT.getScalarType();

	  // UINT_TO_FP(vXi8)  -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
	  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
	  if (SrcVT.isVector()) {
	    if (SrcEltVT == MVT::i8 || SrcEltVT == MVT::i16) {
	      SDLoc dl(N);
	      EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                    SrcVT.getVectorNumElements());
	      SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Src);

	      // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
	      return DAG.getNode(ISD::SINT_TO_FP, dl, ResVT, Widened);
	    }
	  }

	  // UINT_TO_FP is marked custom (hence "legal"), so the generic dag combiner
	  // won't turn it into SINT_TO_FP when the sign bit is known zero. Do that
	  // here instead.
	  if (!DAG.SignBitIsZero(Src))
	    return SDValue();

	  return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), ResVT, Src);
	}

	/// Combine a signed int-to-FP conversion node. Tries, in order: folding a
	/// masked-compare constant, widening narrow vector inputs, narrowing wide
	/// inputs whose upper bits are sign bits, and an x87 FILD for i64 loads on
	/// 32-bit targets. Returns an empty SDValue when nothing applies.
	static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // First try to optimize away the conversion entirely when it's
	  // conditionally from a constant. Vectors only.
	  SDValue Folded = combineVectorCompareAndMaskUnaryOp(N, DAG);
	  if (Folded)
	    return Folded;

	  // Now move on to more general possibilities.
	  SDValue Src = N->getOperand(0);
	  EVT ResVT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcEltVT = SrcVT.getScalarType();

	  // SINT_TO_FP(vXi1)  -> SINT_TO_FP(SEXT(vXi1 to vXi32))
	  // SINT_TO_FP(vXi8)  -> SINT_TO_FP(SEXT(vXi8 to vXi32))
	  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
	  // The vXi1 case only applies while the mask vector type is not legal.
	  if (SrcVT.isVector()) {
	    bool Widen = SrcEltVT == MVT::i8 || SrcEltVT == MVT::i16;
	    if (!Widen && SrcEltVT == MVT::i1)
	      Widen = !DAG.getTargetLoweringInfo().isTypeLegal(SrcVT);
	    if (Widen) {
	      SDLoc dl(N);
	      EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                    SrcVT.getVectorNumElements());
	      SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Src);
	      return DAG.getNode(ISD::SINT_TO_FP, dl, ResVT, Ext);
	    }
	  }

	  // Without AVX512DQ we only support i64 to float scalar conversion. For both
	  // vectors and scalars, see if we know that the upper bits are all the sign
	  // bit, in which case we can truncate the input to i32 and convert from that.
	  if (SrcVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
	    unsigned SrcBits = SrcVT.getScalarSizeInBits();
	    unsigned KnownSignBits = DAG.ComputeNumSignBits(Src);
	    if (KnownSignBits >= (SrcBits - 31)) {
	      EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), 32);
	      if (SrcVT.isVector())
	        NarrowVT = EVT::getVectorVT(*DAG.getContext(), NarrowVT,
	                                    SrcVT.getVectorNumElements());
	      SDLoc dl(N);
	      SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, dl, NarrowVT, Src);
	      return DAG.getNode(ISD::SINT_TO_FP, dl, ResVT, Narrowed);
	    }
	  }

	  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
	  // a 32-bit target where SSE doesn't support i64->FP operations. Everything
	  // below only concerns a load operand with hardware floating point.
	  if (Subtarget.useSoftFloat() || Src.getOpcode() != ISD::LOAD)
	    return SDValue();

	  LoadSDNode *Ld = cast<LoadSDNode>(Src.getNode());
	  EVT LdVT = Ld->getValueType(0);

	  // This transformation is not supported if the result type is f16 or f128.
	  if (ResVT == MVT::f16 || ResVT == MVT::f128)
	    return SDValue();

	  // Require a plain (non-volatile, non-extending) scalar i64 load whose only
	  // user is this conversion, on a 32-bit target.
	  if (Ld->isVolatile() || ResVT.isVector())
	    return SDValue();
	  if (!ISD::isNON_EXTLoad(Src.getNode()) || !Src.hasOneUse())
	    return SDValue();
	  if (Subtarget.is64Bit() || LdVT != MVT::i64)
	    return SDValue();

	  SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
	      SDValue(N, 0), LdVT, Ld->getChain(), Src, DAG);
	  // Redirect users of the load's chain result to the FILD's chain result.
	  DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), FILDChain.getValue(1));
	  return FILDChain;
	}

	/// Combine an X86ISD::SBB node: if combineCarryThroughADD finds a replacement
	/// for the carry input (operand 2), rebuild the SBB with it. Returns an empty
	/// SDValue otherwise.
	static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
	  SDValue NewCarry = combineCarryThroughADD(N->getOperand(2));
	  if (!NewCarry)
	    return SDValue();

	  // Results are the value type of N plus an i32 flags result.
	  MVT ResVT = N->getSimpleValueType(0);
	  SDVTList ResultTys = DAG.getVTList(ResVT, MVT::i32);
	  return DAG.getNode(X86ISD::SBB, SDLoc(N), ResultTys, N->getOperand(0),
	                     N->getOperand(1), NewCarry);
	}

	// Combine RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS.
	//
	// Two folds are attempted: a strength reduction when both addends are zero,
	// and a carry-input simplification via combineCarryThroughADD. Returns an
	// empty SDValue when neither applies.
	static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI) {
	  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
	  // the result is either zero or one (depending on the input carry bit).
	  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
	  const bool AddendsAreZero = X86::isZeroNode(N->getOperand(0)) &&
	                              X86::isZeroNode(N->getOperand(1));
	  // We don't have a good way to replace an EFLAGS use, so only do this when
	  // the flags result is dead right now.
	  if (AddendsAreZero && SDValue(N, 1).use_empty()) {
	    SDLoc DL(N);
	    EVT VT = N->getValueType(0);
	    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
	    SDValue CondB = DAG.getConstant(X86::COND_B, DL, MVT::i8);
	    SDValue SetCarry =
	        DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, CondB, N->getOperand(2));
	    SDValue LowBit =
	        DAG.getNode(ISD::AND, DL, VT, SetCarry, DAG.getConstant(1, DL, VT));
	    return DCI.CombineTo(N, LowBit, CarryOut);
	  }

	  SDValue NewCarry = combineCarryThroughADD(N->getOperand(2));
	  if (!NewCarry)
	    return SDValue();

	  // Rebuild the ADC with the simplified carry input.
	  MVT ResVT = N->getSimpleValueType(0);
	  SDVTList ResultTys = DAG.getVTList(ResVT, MVT::i32);
	  return DAG.getNode(X86ISD::ADC, SDLoc(N), ResultTys, N->getOperand(0),
	                     N->getOperand(1), NewCarry);
	}

	/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
	/// which is more useful than 0/1 in some cases.
	///
	/// N supplies the debug location and the result type (i8 or i1); EFLAGS is
	/// the flags value the carry is read from.
	static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  MVT ResVT = N->getSimpleValueType(0);

	  // "Condition code B" is also known as "the carry flag" (CF).
	  SDValue CarryCC = DAG.getConstant(X86::COND_B, DL, MVT::i8);
	  SDValue CarryMask =
	      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CarryCC, EFLAGS);

	  if (ResVT != MVT::i8) {
	    assert(ResVT == MVT::i1 && "Unexpected type for SETCC node");
	    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryMask);
	  }

	  // For i8, keep only the low bit of the mask.
	  return DAG.getNode(ISD::AND, DL, ResVT, CarryMask,
	                     DAG.getConstant(1, DL, ResVT));
	}

	/// If this is an add or subtract where one operand is produced by a cmp+setcc,
	/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
	/// with CMP+{ADC, SBB}.
	///
	/// N is treated as a subtract when its opcode is ISD::SUB and as an add
	/// otherwise. In the code below X is the "other" operand and Y is the
	/// X86ISD::SETCC operand (possibly found behind a one-use zext). Returns the
	/// replacement value, or an empty SDValue when no pattern matches.
	static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
	bool IsSub = N->getOpcode() == ISD::SUB;
	SDValue X = N->getOperand(0);
	SDValue Y = N->getOperand(1);

	// If this is an add, canonicalize a zext operand to the RHS.
	// TODO: Incomplete? What if both sides are zexts?
	if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
	Y.getOpcode() != ISD::ZERO_EXTEND)
	std::swap(X, Y);

	// Look through a one-use zext.
	bool PeekedThroughZext = false;
	if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
	Y = Y.getOperand(0);
	PeekedThroughZext = true;
	}

	// If this is an add, canonicalize a setcc operand to the RHS.
	// TODO: Incomplete? What if both sides are setcc?
	// TODO: Should we allow peeking through a zext of the other operand?
	if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
	Y.getOpcode() != X86ISD::SETCC)
	std::swap(X, Y);

	// From here on Y must be a single-use X86ISD::SETCC; its operand 0 is the
	// condition code and its operand 1 is the flags-producing value.
	if (Y.getOpcode() != X86ISD::SETCC \|\| !Y.hasOneUse())
	return SDValue();

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	auto *ConstantX = dyn_cast<ConstantSDNode>(X);
	if (ConstantX) {
	if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
	// This is a complicated way to get -1 or 0 from the carry flag:
	// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Y.getOperand(1));
	}

	if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
	SDValue EFLAGS = Y->getOperand(1);
	// Only a single-use integer X86ISD::SUB with a non-constant second operand
	// is rebuilt with its operands swapped.
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	// Swap the operands of a SUB, and we have the same pattern as above.
	// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
	// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
	SDValue NewSub = DAG.getNode(
	X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	// Pick the same result number of the new SUB that EFLAGS referred to.
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	NewEFLAGS);
	}
	}
	}

	if (CC == X86::COND_B) {
	// X + SETB Z --> X + (mask SBB Z, Z)
	// X - SETB Z --> X - (mask SBB Z, Z)
	// TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
	SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
	// Bring the materialized value to the width of the add/sub result.
	if (SBB.getValueSizeInBits() != VT.getSizeInBits())
	SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
	return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
	}

	if (CC == X86::COND_A) {
	SDValue EFLAGS = Y->getOperand(1);
	// Try to convert COND_A into COND_B in an attempt to facilitate
	// materializing "setb reg".
	//
	// Do not flip "e > c", where "c" is a constant, because Cmp instruction
	// cannot take an immediate as its first operand.
	//
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
	EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
	if (SBB.getValueSizeInBits() != VT.getSizeInBits())
	SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
	return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
	}
	}

	// The remaining patterns only handle equality/inequality conditions.
	if (CC != X86::COND_E && CC != X86::COND_NE)
	return SDValue();

	// Require the flags to come from a single-use integer compare against zero.
	SDValue Cmp = Y.getOperand(1);
	if (Cmp.getOpcode() != X86ISD::CMP \|\| !Cmp.hasOneUse() \|\|
	!X86::isZeroNode(Cmp.getOperand(1)) \|\|
	!Cmp.getOperand(0).getValueType().isInteger())
	return SDValue();

	// Z is the value being compared with zero.
	SDValue Z = Cmp.getOperand(0);
	EVT ZVT = Z.getValueType();

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	if (ConstantX) {
	// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
	// fake operands:
	// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
	// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
	if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
	SDValue Zero = DAG.getConstant(0, DL, ZVT);
	SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
	// Result 1 of the X86ISD::SUB is its i32 (flags) result.
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	}

	// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
	// with fake operands:
	// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
	// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
	if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
	}
	}

	// General case: X is arbitrary, so emit a real ADC/SBB against a constant.
	// (cmp Z, 1) sets the carry flag if Z is 0.
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

	// Add the flags type for ADC/SBB nodes.
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);

	// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
	// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
	if (CC == X86::COND_NE)
	return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
	DAG.getConstant(-1ULL, DL, VT), Cmp1);

	// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
	// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
	return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
	DAG.getConstant(0, DL, VT), Cmp1);
	}

	/// Try to turn a reduction add whose one operand is a narrowable vector MUL
	/// into X86ISD::VPMADDWD followed by an add with the other operand. Requires
	/// SSE2. Returns an empty SDValue when the pattern does not match.
	static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // One operand must be the multiply; the other is treated as the phi.
	  SDValue MulOp = N->getOperand(0);
	  SDValue Phi = N->getOperand(1);
	  if (MulOp.getOpcode() != ISD::MUL) {
	    std::swap(MulOp, Phi);
	    if (MulOp.getOpcode() != ISD::MUL)
	      return SDValue();
	  }

	  // The multiply must be reducible to a narrower width, and not in the
	  // MULU16 mode.
	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
	    return SDValue();
	  if (Mode == MULU16)
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // Widest register usable for the narrowed operation on this subtarget.
	  unsigned RegSize =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

	  // If the vector size is less than 128, or greater than the supported
	  // RegSize, do not use PMADD.
	  unsigned VectorSize = VT.getVectorNumElements() * 16;
	  if (VectorSize < 128)
	    return SDValue();
	  if (VectorSize > RegSize)
	    return SDValue();

	  SDLoc DL(N);
	  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	                                   VT.getVectorNumElements());
	  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                VT.getVectorNumElements() / 2);

	  // Shrink the operands of mul.
	  SDValue NarrowLHS =
	      DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
	  SDValue NarrowRHS =
	      DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));

	  // Madd vector size is half of the original vector size
	  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, NarrowLHS, NarrowRHS);
	  // Fill the rest of the output with 0
	  SDValue ZeroHalf =
	      getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
	  SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, ZeroHalf);
	  return DAG.getNode(ISD::ADD, DL, VT, Widened, Phi);
	}

	/// Try to turn a reduction add of a vXi32 abs-diff select into a PSADBW
	/// (built by createPSADBW) followed by an add with the other operand.
	/// Requires SSE2. Returns an empty SDValue when the pattern does not match.
	static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // TODO: There's nothing special about i32, any integer type above i16 should
	  // work just as well.
	  if (!VT.isVector())
	    return SDValue();
	  if (!VT.isSimple())
	    return SDValue();
	  if (VT.getVectorElementType() != MVT::i32)
	    return SDValue();

	  unsigned RegSize =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

	  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
	  // TODO: We should be able to handle larger vectors by splitting them before
	  // feeding them into several SADs, and then reducing over those.
	  if (VT.getSizeInBits() / 4 > RegSize)
	    return SDValue();

	  // We know N is a reduction add, which means one of its operands is a phi.
	  // To match SAD, we need the other operand to be a vector select.
	  const bool FirstIsSelect = Op0.getOpcode() == ISD::VSELECT;
	  if (!FirstIsSelect && Op1.getOpcode() != ISD::VSELECT)
	    return SDValue();
	  SDValue SelectOp = FirstIsSelect ? Op0 : Op1;
	  SDValue Phi = FirstIsSelect ? Op1 : Op0;

	  // Check whether we have an abs-diff pattern feeding into the select.
	  // Op0/Op1 are handed to the helper and then used as the PSADBW inputs.
	  // NOTE(review): presumably the helper overwrites them with the abs-diff
	  // sources - confirm against detectZextAbsDiff.
	  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
	    return SDValue();

	  // SAD pattern detected. Now build a SAD instruction and an addition for
	  // reduction. Note that the number of elements of the result of SAD is less
	  // than the number of elements of its input. Therefore, we could only update
	  // part of elements in the reduction vector.
	  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

	  // The output of PSADBW is a vector of i64, but the reduction works on i32
	  // lanes. If the reduction vector is at least as wide as the psadbw result,
	  // just bitcast. If it's narrower, truncate - the high i32 of each i64 is
	  // zero anyway.
	  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
	  if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
	    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
	  } else {
	    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
	    if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
	      // Fill the upper elements with zero to match the add width.
	      SDValue ZeroVec = DAG.getConstant(0, DL, VT);
	      Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, ZeroVec, Sad,
	                        DAG.getIntPtrConstant(0, DL));
	    }
	  }

	  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
	}

	/// Convert vector increment or decrement to sub/add with an all-ones constant:
	/// add X, <1, 1...> --> sub X, <-1, -1...>
	/// sub X, <1, 1...> --> add X, <-1, -1...>
	/// The all-ones vector constant can be materialized using a pcmpeq instruction
	/// that is commonly recognized as an idiom (has no register dependency), so
	/// that's better/smaller than loading a splat 1 constant.
	///
	/// Returns an empty SDValue when N is not such an increment/decrement.
	static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
	  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
	         "Unexpected opcode for increment/decrement transform");

	  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
	  // out and wait for legalization if we have an unsupported vector length.
	  EVT VT = N->getValueType(0);
	  const bool SupportedWidth =
	      VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector();
	  if (!SupportedWidth)
	    return SDValue();

	  // The second operand must be a constant splat of the value 1.
	  APInt SplatVal;
	  SDNode *RHS = N->getOperand(1).getNode();
	  if (!ISD::isConstantSplatVector(RHS, SplatVal))
	    return SDValue();
	  if (!SplatVal.isOneValue())
	    return SDValue();

	  // Flip the opcode and use the all-ones vector in place of the splat of 1.
	  SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
	  unsigned FlippedOpcode = ISD::ADD;
	  if (N->getOpcode() == ISD::ADD)
	    FlippedOpcode = ISD::SUB;
	  return DAG.getNode(FlippedOpcode, SDLoc(N), VT, N->getOperand(0),
	                     AllOnesVec);
	}

	/// Combine an ADD node. Tries the reduction patterns (SAD, then MADD) when
	/// the node carries the vector-reduction flag, then horizontal add, then the
	/// inc/dec vector rewrite, and finally the ADC/SBB conversion.
	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  if (N->getFlags().hasVectorReduction()) {
	    SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget);
	    if (Sad)
	      return Sad;
	    SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget);
	    if (MAdd)
	      return MAdd;
	  }

	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Try to synthesize horizontal adds from adds of shuffles.
	  const bool Is128BitHAddType =
	      Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32);
	  const bool Is256BitHAddType =
	      Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32);
	  // LHS/RHS are passed to the matcher and then used as the HADD operands.
	  if ((Is128BitHAddType || Is256BitHAddType) &&
	      isHorizontalBinOp(LHS, RHS, true))
	    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, LHS, RHS);

	  SDValue IncDec = combineIncDecVector(N, DAG);
	  if (IncDec)
	    return IncDec;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Try to turn a vector SUB whose operand is a UMAX/UMIN of the other operand
	/// into an X86ISD::SUBUS node:
	///   umax(a,b) - b  or  a - umin(a,b)  -->  subus(a,b)
	/// For v8i32/v16i32/v8i64 the operation is first shrunk to a narrower element
	/// type, which is only done when enough leading bits of the LHS are known
	/// zero. Returns an empty SDValue when the combine does not apply.
	static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// PSUBUS is supported, starting from SSE2, but special preprocessing
	// for v8i32 requires umin, which appears in SSE41.
	// Bail out unless VT is one of the types accepted for the available
	// subtarget features.
	if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 \|\| VT == MVT::v8i16)) &&
	!(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
	!(Subtarget.hasAVX2() && (VT == MVT::v32i8 \|\| VT == MVT::v16i16)) &&
	!(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
	(VT == MVT::v64i8 \|\| VT == MVT::v32i16 \|\| VT == MVT::v16i32 \|\|
	VT == MVT::v8i64)))
	return SDValue();

	SDValue SubusLHS, SubusRHS;
	// Try to find umax(a,b) - b or a - umin(a,b) patterns
	// they may be converted to subus(a,b).
	// TODO: Need to add IR canonicalization for this code.
	if (Op0.getOpcode() == ISD::UMAX) {
	// umax(a,b) - b: the subtrahend must be one of the UMAX operands; the
	// other UMAX operand becomes the SUBUS LHS.
	SubusRHS = Op1;
	SDValue MaxLHS = Op0.getOperand(0);
	SDValue MaxRHS = Op0.getOperand(1);
	if (MaxLHS == Op1)
	SubusLHS = MaxRHS;
	else if (MaxRHS == Op1)
	SubusLHS = MaxLHS;
	else
	return SDValue();
	} else if (Op1.getOpcode() == ISD::UMIN) {
	// a - umin(a,b): the minuend must be one of the UMIN operands; the other
	// UMIN operand becomes the SUBUS RHS.
	SubusLHS = Op0;
	SDValue MinLHS = Op1.getOperand(0);
	SDValue MinRHS = Op1.getOperand(1);
	if (MinLHS == Op0)
	SubusRHS = MinRHS;
	else if (MinRHS == Op0)
	SubusRHS = MinLHS;
	else
	return SDValue();
	} else
	return SDValue();

	// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
	// special preprocessing in some cases.
	if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
	return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

	// Special preprocessing case can be only applied
	// if the value was zero extended from 16 bit,
	// so we require first 16 bits to be zeros for 32 bit
	// values, or first 48 bits for 64 bit values.
	KnownBits Known;
	DAG.computeKnownBits(SubusLHS, Known);
	unsigned NumZeros = Known.countMinLeadingZeros();
	if ((VT == MVT::v8i64 && NumZeros < 48) \|\| NumZeros < 16)
	return SDValue();

	// Pick the narrow type to perform the SUBUS in: v8i16 for the 8-element
	// types; for v16i32, v16i8 when at least 24 leading bits are known zero and
	// v16i16 otherwise.
	EVT ExtType = SubusLHS.getValueType();
	EVT ShrinkedType;
	if (VT == MVT::v8i32 \|\| VT == MVT::v8i64)
	ShrinkedType = MVT::v8i16;
	else
	ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

	// If SubusLHS is zeroextended - truncate SubusRHS to its
	// size SubusRHS = umin(0xFFF.., SubusRHS).
	SDValue SaturationConst =
	DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
	ShrinkedType.getScalarSizeInBits()),
	SDLoc(SubusLHS), ExtType);
	SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
	SaturationConst);
	SDValue NewSubusLHS =
	DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
	SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
	SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
	NewSubusLHS, NewSubusRHS);
	// Zero extend the result, it may be used somewhere as 32 bit,
	// if not zext and following trunc will shrink.
	return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
	}

	/// Combine a SUB node. Tries, in order: pushing a constant LHS through a
	/// one-use XOR-with-constant, horizontal sub, the inc/dec vector rewrite,
	/// the PSUBUS conversion, and finally the ADC/SBB conversion.
	static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // X86 can't encode an immediate LHS of a sub. See if we can push the
	  // negation into a preceding instruction.
	  ConstantSDNode *LHSC = dyn_cast<ConstantSDNode>(LHS);
	  if (LHSC && RHS->hasOneUse() && RHS.getOpcode() == ISD::XOR) {
	    // If the RHS of the sub is a XOR with one use and a constant, invert the
	    // immediate. Then add one to the LHS of the sub so we can turn
	    // X-Y -> X+~Y+1, saving one register.
	    if (isa<ConstantSDNode>(RHS.getOperand(1))) {
	      APInt XorC = cast<ConstantSDNode>(RHS.getOperand(1))->getAPIntValue();
	      EVT ConstVT = LHS.getValueType();
	      SDValue InvertedImm = DAG.getConstant(~XorC, SDLoc(RHS), ConstVT);
	      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(RHS), ConstVT,
	                                   RHS.getOperand(0), InvertedImm);
	      SDValue BumpedLHS =
	          DAG.getConstant(LHSC->getAPIntValue() + 1, SDLoc(N), ConstVT);
	      return DAG.getNode(ISD::ADD, SDLoc(N), ConstVT, NewXor, BumpedLHS);
	    }
	  }

	  // Try to synthesize horizontal subs from subs of shuffles.
	  EVT VT = N->getValueType(0);
	  const bool Is128BitHSubType =
	      Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32);
	  const bool Is256BitHSubType =
	      Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32);
	  // LHS/RHS are passed to the matcher and then used as the HSUB operands.
	  if ((Is128BitHSubType || Is256BitHSubType) &&
	      isHorizontalBinOp(LHS, RHS, false))
	    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, LHS, RHS);

	  SDValue IncDec = combineIncDecVector(N, DAG);
	  if (IncDec)
	    return IncDec;

	  // Try to create PSUBUS if SUB's argument is max/min
	  SDValue Subus = combineSubToSubus(N, DAG, Subtarget);
	  if (Subus)
	    return Subus;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Combine a vector zero/sign extension node. Only runs once type
	/// legalization has started (it bails out while DCI.isBeforeLegalize()).
	/// Performs constant folding for any handled opcode, and for X86ISD::VZEXT
	/// additionally collapses nested vzexts and bypasses an
	/// extract-element/scalar-to-vector round trip. Returns an empty SDValue when
	/// nothing applies.
	static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize())
	return SDValue();

	SDLoc DL(N);
	unsigned Opcode = N->getOpcode();
	MVT VT = N->getSimpleValueType(0);
	MVT SVT = VT.getVectorElementType();
	unsigned NumElts = VT.getVectorNumElements();
	unsigned EltSizeInBits = SVT.getSizeInBits();

	SDValue Op = N->getOperand(0);
	MVT OpVT = Op.getSimpleValueType();
	MVT OpEltVT = OpVT.getVectorElementType();
	unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
	// Number of source bits consumed: one source element per result element.
	unsigned InputBits = OpEltSizeInBits * NumElts;

	// Perform any constant folding.
	// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
	APInt Undefs(NumElts, 0);
	SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
	// Anything that is not one of the two zero-extending opcodes is folded as
	// a sign extension.
	bool IsZEXT =
	(Opcode == X86ISD::VZEXT) \|\| (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
	for (unsigned i = 0; i != NumElts; ++i) {
	// Undef source elements stay undef in the result.
	if (UndefElts[i]) {
	Undefs.setBit(i);
	continue;
	}
	Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
	: EltBits[i].sextOrTrunc(EltSizeInBits);
	}
	return getConstVector(Vals, Undefs, VT, DAG, DL);
	}

	// (vzext (bitcast (vzext (x)) -> (vzext x)
	// TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
	SDValue V = peekThroughBitcasts(Op);
	if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
	MVT InnerVT = V.getSimpleValueType();
	MVT InnerEltVT = InnerVT.getVectorElementType();

	// If the element sizes match exactly, we can just do one larger vzext. This
	// is always an exact type match as vzext operates on integer types.
	if (OpEltVT == InnerEltVT) {
	assert(OpVT == InnerVT && "Types must match for vzext!");
	return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
	}

	// The only other way we can combine them is if only a single element of the
	// inner vzext is used in the input to the outer vzext.
	if (InnerEltVT.getSizeInBits() < InputBits)
	return SDValue();

	// In this case, the inner vzext is completely dead because we're going to
	// only look at bits inside of the low element. Just do the outer vzext on
	// a bitcast of the input to the inner.
	return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
	}

	// Check if we can bypass extracting and re-inserting an element of an input
	// vector. Essentially:
	// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
	// TODO: Add X86ISD::VSEXT support
	if (Opcode == X86ISD::VZEXT &&
	V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
	SDValue ExtractedV = V.getOperand(0);
	SDValue OrigV = ExtractedV.getOperand(0);
	// Only an extract of element 0 can be bypassed.
	if (isNullConstant(ExtractedV.getOperand(1))) {
	MVT OrigVT = OrigV.getSimpleValueType();
	// Extract a subvector if necessary...
	if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
	int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
	OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
	OrigVT.getVectorNumElements() / Ratio);
	OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
	DAG.getIntPtrConstant(0, DL));
	}
	Op = DAG.getBitcast(OpVT, OrigV);
	return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
	}
	}

	return SDValue();
	}

	/// Combine an X86ISD::TESTM node: strip a redundant AND when both operands
	/// are the same AND node, and fold to a zero vector when either operand is
	/// an all-zeros build vector. Returns an empty SDValue otherwise.
	static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  // TEST (AND a, b) ,(AND a, b) -> TEST a, b
	  const bool SameAndNode = LHS == RHS && RHS->getOpcode() == ISD::AND;
	  if (SameAndNode)
	    return DAG.getNode(X86ISD::TESTM, DL, VT, LHS->getOperand(0),
	                       LHS->getOperand(1));

	  // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
	  // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
	  const bool HasZeroOperand = ISD::isBuildVectorAllZeros(LHS.getNode()) ||
	                              ISD::isBuildVectorAllZeros(RHS.getNode());
	  if (!HasZeroOperand)
	    return SDValue();

	  return getZeroVector(VT, Subtarget, DAG, DL);
	}

	/// Fold a vector compare of a value with itself: PCMPEQ x, x becomes the
	/// all-ones vector and PCMPGT x, x becomes the zero vector. Returns an empty
	/// SDValue for any other input.
	static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  // Nothing to fold unless both operands are the same value.
	  if (N->getOperand(0) != N->getOperand(1))
	    return SDValue();

	  switch (N->getOpcode()) {
	  case X86ISD::PCMPEQ:
	    return getOnesVector(VT, DAG, DL);
	  case X86ISD::PCMPGT:
	    return getZeroVector(VT, Subtarget, DAG, DL);
	  default:
	    return SDValue();
	  }
	}

	static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	MVT OpVT = N->getSimpleValueType(0);

	// Early out for mask vectors.
	if (OpVT.getVectorElementType() == MVT::i1)
	return SDValue();

	SDLoc dl(N);
	SDValue Vec = N->getOperand(0);
	SDValue SubVec = N->getOperand(1);

	unsigned IdxVal = N->getConstantOperandVal(2);
	MVT SubVecVT = SubVec.getSimpleValueType();

	if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	// Inserting zeros into zeros is a nop.
	if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	return Vec;

	// If we're inserting into a zero vector and then into a larger zero vector,
	// just insert into the larger zero vector directly.
	if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
	unsigned Idx2Val = SubVec.getConstantOperandVal(2);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
	SubVec.getOperand(1),
	DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
	}

	// If we're inserting a bitcast into zeros, rewrite the insert and move the
	// bitcast to the other side. This helps with detecting zero extending
	// during isel.
	// TODO: Is this useful for other indices than 0?
	if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
	MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
	unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
	MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
	SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
	DAG.getBitcast(NewVT, Vec),
	SubVec.getOperand(0), N->getOperand(2));
	return DAG.getBitcast(OpVT, Insert);
	}
	}

	// If this is an insert of an extract, combine to a shuffle. Don't do this
	// if the insert or extract can be represented with a subregister operation.
	if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	SubVec.getOperand(0).getSimpleValueType() == OpVT &&
	(IdxVal != 0 \|\| !Vec.isUndef())) {
	int ExtIdxVal = SubVec.getConstantOperandVal(1);
	if (ExtIdxVal != 0) {
	int VecNumElts = OpVT.getVectorNumElements();
	int SubVecNumElts = SubVecVT.getVectorNumElements();
	SmallVector<int, 64> Mask(VecNumElts);
	// First create an identity shuffle mask.
	for (int i = 0; i != VecNumElts; ++i)
	Mask[i] = i;
	// Now insert the extracted portion.
	for (int i = 0; i != SubVecNumElts; ++i)
	Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

	return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
	}
	}

	// Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
	// load:
	// (insert_subvector (insert_subvector undef, (load16 addr), 0),
	// (load16 addr + 16), Elts/2)
	// --> load32 addr
	// or:
	// (insert_subvector (insert_subvector undef, (load32 addr), 0),
	// (load32 addr + 32), Elts/2)
	// --> load64 addr
	// or a 16-byte or 32-byte broadcast:
	// (insert_subvector (insert_subvector undef, (load16 addr), 0),
	// (load16 addr), Elts/2)
	// --> X86SubVBroadcast(load16 addr)
	// or:
	// (insert_subvector (insert_subvector undef, (load32 addr), 0),
	// (load32 addr), Elts/2)
	// --> X86SubVBroadcast(load32 addr)
	if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
	Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
	auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
	if (Idx2 && Idx2->getZExtValue() == 0) {
	SDValue SubVec2 = Vec.getOperand(1);
	// If needed, look through bitcasts to get to the load.
	if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
	bool Fast;
	unsigned Alignment = FirstLd->getAlignment();
	unsigned AS = FirstLd->getAddressSpace();
	const X86TargetLowering *TLI = Subtarget.getTargetLowering();
	if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
	OpVT, AS, Alignment, &Fast) && Fast) {
	SDValue Ops[] = {SubVec2, SubVec};
	if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
	Subtarget, false))
	return Ld;
	}
	}
	// If lower/upper loads are the same and the only users of the load, then
	// lower to a VBROADCASTF128/VBROADCASTI128/etc.
	if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
	if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
	SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
	return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);

	// If this is subv_broadcast insert into both halves, use a larger
	// subv_broadcast.
	if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
	return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
	SubVec.getOperand(0));

	// If we're inserting all zeros into the upper half, change this to
	// an insert into an all zeros vector. We will match this to a move
	// with implicit upper bit zeroing during isel.
	if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
	Vec.getOperand(2));

	// If we are inserting into both halves of the vector, the starting
	// vector should be undef. If it isn't, make it so. Only do this if the
	// the early insert has no other uses.
	// TODO: Should this be a generic DAG combine?
	if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
	SubVec2, Vec.getOperand(2));
	DCI.AddToWorklist(Vec.getNode());
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
	N->getOperand(2));

	}
	}
	}

	return SDValue();
	}

	/// Fold an EXTRACT_SUBVECTOR whose source vector is a BUILD_VECTOR (all
	/// zeros, all ones, or arbitrary operands) into a directly built result.
	/// Does nothing while DCI reports that operation legalization has not begun.
	/// Returns an empty SDValue when no fold applies.
	static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDLoc DL(N);
	  MVT ResVT = N->getSimpleValueType(0);
	  SDValue Src = N->getOperand(0);
	  // The extraction index is required to be a constant operand.
	  unsigned FirstElt = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();

	  // Any slice of an all-zeros vector is itself all zeros.
	  if (ISD::isBuildVectorAllZeros(Src.getNode()))
	    return getZeroVector(ResVT, Subtarget, DAG, DL);

	  // Likewise for all-ones; i1 element vectors use the constant 1 instead of
	  // the generic ones-vector helper.
	  if (ISD::isBuildVectorAllOnes(Src.getNode()))
	    return ResVT.getScalarType() == MVT::i1 ? DAG.getConstant(1, DL, ResVT)
	                                            : getOnesVector(ResVT, DAG, DL);

	  if (Src.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  // Build the result from just the operands covered by the extraction.
	  return DAG.getBuildVector(
	      ResVT, DL,
	      Src.getNode()->ops().slice(FirstElt, ResVT.getVectorNumElements()));
	}

	/// Entry point for X86-specific DAG combines. Dispatches on the opcode of
	/// \p N to the matching combine* helper and returns whatever that helper
	/// returns. Opcodes without a case fall out of the switch and yield an empty
	/// SDValue.
	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	switch (N->getOpcode()) {
	default: break;
	// Element / subvector extraction and insertion.
	case ISD::EXTRACT_VECTOR_ELT:
	case X86ISD::PEXTRW:
	case X86ISD::PEXTRB:
	return combineExtractVectorElt(N, DAG, DCI, Subtarget);
	case ISD::INSERT_SUBVECTOR:
	return combineInsertSubvector(N, DAG, DCI, Subtarget);
	case ISD::EXTRACT_SUBVECTOR:
	return combineExtractSubvector(N, DAG, DCI, Subtarget);
	// Selects, bitcasts and conditional moves.
	case ISD::VSELECT:
	case ISD::SELECT:
	case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
	case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
	case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
	// Integer arithmetic, shifts and bitwise logic.
	case ISD::ADD: return combineAdd(N, DAG, Subtarget);
	case ISD::SUB: return combineSub(N, DAG, Subtarget);
	case X86ISD::SBB: return combineSBB(N, DAG);
	case X86ISD::ADC: return combineADC(N, DAG, DCI);
	case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
	case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
	case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
	case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
	// Plain and masked memory operations.
	case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
	case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
	case ISD::STORE: return combineStore(N, DAG, Subtarget);
	case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
	// Int-to-FP conversions and floating-point operations.
	case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
	case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
	case ISD::FADD:
	case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
	case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
	case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
	case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
	case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
	case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
	case X86ISD::FXOR:
	case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
	case X86ISD::FMIN:
	case X86ISD::FMAX: return combineFMinFMax(N, DAG);
	case ISD::FMINNUM:
	case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
	case X86ISD::BT: return combineBT(N, DAG, DCI);
	// Extensions, comparisons and branches.
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
	case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
	case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
	case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
	// Vector packs, immediate shifts, in-register extends and element inserts.
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
	case X86ISD::VSHLI:
	case X86ISD::VSRAI:
	case X86ISD::VSRLI:
	return combineVectorShiftImm(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case X86ISD::VSEXT:
	case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
	case X86ISD::SHUFP: // Handle all target specific shuffles
	case X86ISD::INSERTPS:
	case X86ISD::EXTRQI:
	case X86ISD::INSERTQI:
	case X86ISD::PALIGNR:
	case X86ISD::VSHLDQ:
	case X86ISD::VSRLDQ:
	case X86ISD::BLENDI:
	case X86ISD::UNPCKH:
	case X86ISD::UNPCKL:
	case X86ISD::MOVHLPS:
	case X86ISD::MOVLHPS:
	case X86ISD::PSHUFB:
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFHW:
	case X86ISD::PSHUFLW:
	case X86ISD::MOVSHDUP:
	case X86ISD::MOVSLDUP:
	case X86ISD::MOVDDUP:
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	case X86ISD::VBROADCAST:
	case X86ISD::VPPERM:
	case X86ISD::VPERMI:
	case X86ISD::VPERMV:
	case X86ISD::VPERMV3:
	case X86ISD::VPERMIV3:
	case X86ISD::VPERMIL2:
	case X86ISD::VPERMILPI:
	case X86ISD::VPERMILPV:
	case X86ISD::VPERM2X128:
	case X86ISD::VZEXT_MOVL:
	case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
	// Fused multiply-add families.
	case X86ISD::FMADD_RND:
	case X86ISD::FMADDS1_RND:
	case X86ISD::FMADDS3_RND:
	case X86ISD::FMADDS1:
	case X86ISD::FMADDS3:
	case X86ISD::FMADD4S:
	case ISD::FMA: return combineFMA(N, DAG, Subtarget);
	case X86ISD::FMADDSUB_RND:
	case X86ISD::FMSUBADD_RND:
	case X86ISD::FMADDSUB:
	case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
	// Mask extraction, gather/scatter and vector compares.
	case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
	case X86ISD::MGATHER:
	case X86ISD::MSCATTER:
	case ISD::MGATHER:
	case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
	case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
	case X86ISD::PCMPEQ:
	case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
	}

	// No combine is registered for this opcode.
	return SDValue();
	}

	/// Report whether \p VT is both legal for the target and a 'desirable' type
	/// for a node with opcode \p Opc. On x86, i16 is legal but undesirable for a
	/// number of operations because i16 instruction encodings are longer and
	/// some i16 instructions are slow.
	bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
	  // An illegal type is never desirable.
	  if (!isTypeLegal(VT))
	    return false;

	  // Every legal type other than i16 is accepted for any opcode.
	  const bool IsI16 = (VT == MVT::i16);
	  if (!IsI16)
	    return true;

	  // i16 is rejected only for the opcodes listed below.
	  bool Desirable = true;
	  switch (Opc) {
	  case ISD::LOAD:
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	  case ISD::SHL:
	  case ISD::SRL:
	  case ISD::SUB:
	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    Desirable = false;
	    break;
	  default:
	    break;
	  }
	  return Desirable;
	}

	/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
	/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
	/// we don't adjust the stack we clobber the first frame index.
	/// See X86InstrInfo::copyPhysReg.
	static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
	const MachineRegisterInfo &MRI = MF.getRegInfo();
	return any_of(MRI.reg_instructions(X86::EFLAGS),
	[](const MachineInstr &RI) { return RI.isCopy(); });
	}

	/// Flag the frame info when the function contains an EFLAGS copy that
	/// implies a stack adjustment, then run the generic finalization.
	void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
	  const bool NeedsStackAdjustment = hasCopyImplyingStackAdjustment(MF);
	  if (NeedsStackAdjustment)
	    MF.getFrameInfo().setHasCopyImplyingStackAdjustment(true);

	  TargetLoweringBase::finalizeLowering(MF);
	}

	/// Query whether it is beneficial for the DAG combiner to promote the node
	/// \p Op. Only i16 nodes are considered. When the function runs to the end,
	/// \p PVT is set to i32 (the promotion type) and the return value says
	/// whether promotion is desirable; the early 'false' exits leave \p PVT
	/// untouched.
	bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
	  if (Op.getValueType() != MVT::i16)
	    return false;

	  const unsigned Opc = Op.getOpcode();
	  bool ShouldPromote = false;
	  switch (Opc) {
	  default:
	    break;

	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	    ShouldPromote = true;
	    break;

	  case ISD::SHL:
	  case ISD::SRL:
	    // Look out for (store (shl (load), x)).
	    if (MayFoldLoad(Op.getOperand(0)) && MayFoldIntoStore(Op))
	      return false;
	    ShouldPromote = true;
	    break;

	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	  case ISD::SUB: {
	    // SUB is the only non-commutative opcode in this group.
	    const bool IsCommutative = (Opc != ISD::SUB);
	    SDValue LHS = Op.getOperand(0);
	    SDValue RHS = Op.getOperand(1);

	    if (!IsCommutative && MayFoldLoad(RHS))
	      return false;

	    // Avoid disabling potential load folding opportunities.
	    if (MayFoldLoad(LHS) &&
	        (!isa<ConstantSDNode>(RHS) || MayFoldIntoStore(Op)))
	      return false;
	    if (MayFoldLoad(RHS) &&
	        (!isa<ConstantSDNode>(LHS) || MayFoldIntoStore(Op)))
	      return false;

	    ShouldPromote = true;
	    break;
	  }
	  }

	  PVT = MVT::i32;
	  return ShouldPromote;
	}

	/// Decide whether a build_vector should be combined into a shuffle followed
	/// by a truncate. \p ShuffleMask must have one entry per element of \p SrcVT
	/// and must already be a legal shuffle mask (both are asserted).
	bool X86TargetLowering::
	isDesirableToCombineBuildVectorToShuffleTruncate(
	    ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {

	  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
	         "Element count mismatch");
	  assert(
	      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
	      "Shuffle Mask expected to be legal");

	  // For 32-bit elements VPERMD is better than shuffle+truncate.
	  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
	  if (SrcVT.getScalarSizeInBits() == 32)
	    return false;

	  // The combine is only attempted when AVX2 is available.
	  if (!Subtarget.hasAVX2())
	    return false;

	  // Shuffles that cross 128-bit lanes are not worth it.
	  return !is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask);
	}

	//===----------------------------------------------------------------------===//
	// X86 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Helper that checks whether S consists of exactly the given pieces, in
	// order, separated by blanks (spaces or tabs). Leading blanks are ignored.
	// Every piece must be followed by at least one blank or by the end of S.
	static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
	  const char *const Blanks = " \t";

	  // Skip leading whitespace.
	  S = S.substr(S.find_first_not_of(Blanks));

	  for (StringRef Piece : Pieces) {
	    // Strip the piece off the front, or fail if it is not there.
	    if (!S.consume_front(Piece))
	      return false;

	    // A non-blank character directly after the piece means we only matched
	    // a prefix of a longer token.
	    StringRef::size_type NextToken = S.find_first_not_of(Blanks);
	    if (NextToken == 0)
	      return false;

	    S = S.substr(NextToken);
	  }

	  // All pieces consumed; nothing else may remain.
	  return S.empty();
	}

	static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

	if (AsmPieces.size() == 3 \|\| AsmPieces.size() == 4) {
	if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

	if (AsmPieces.size() == 3)
	return true;
	else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
	return true;
	}
	}
	return false;
	}

	/// Recognize a few well-known inline-asm byte-swap idioms and replace the
	/// call \p CI via IntrinsicLowering::LowerToByteSwap. Returns that helper's
	/// result when an idiom matched, and false otherwise.
	bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
	  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

	  // Only integer results whose width is a multiple of 16 bits are handled.
	  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
	  if (!Ty || Ty->getBitWidth() % 16 != 0)
	    return false;

	  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
	  const std::string &AsmStr = IA->getAsmString();
	  SmallVector<StringRef, 4> AsmPieces;
	  SplitString(AsmStr, AsmPieces, ";\n");

	  // True when the constraint string starts with "=r,0," and the remainder is
	  // one of the clobber lists accepted by clobbersFlagRegisters.
	  auto HasOnlyFlagClobbers = [IA]() -> bool {
	    const std::string &Constraints = IA->getConstraintString();
	    if (Constraints.compare(0, 5, "=r,0,") != 0)
	      return false;
	    SmallVector<StringRef, 4> Clobbers;
	    SplitString(StringRef(Constraints).substr(5), Clobbers, ",");
	    array_pod_sort(Clobbers.begin(), Clobbers.end());
	    return clobbersFlagRegisters(Clobbers);
	  };

	  const size_t NumStatements = AsmPieces.size();

	  if (NumStatements == 1) {
	    StringRef Stmt = AsmPieces[0];

	    // FIXME: this should verify that we are targeting a 486 or better. If not,
	    // we will turn this bswap into something that will be lowered to logical
	    // ops instead of emitting the bswap asm. For now, we don't support 486 or
	    // lower so don't worry about this.
	    // bswap $0
	    const bool IsPlainBSwap = matchAsm(Stmt, {"bswap", "$0"}) ||
	                              matchAsm(Stmt, {"bswapl", "$0"}) ||
	                              matchAsm(Stmt, {"bswapq", "$0"}) ||
	                              matchAsm(Stmt, {"bswap", "${0:q}"}) ||
	                              matchAsm(Stmt, {"bswapl", "${0:q}"}) ||
	                              matchAsm(Stmt, {"bswapq", "${0:q}"});
	    // No need to check constraints, nothing other than the equivalent of
	    // "=r,0" would be valid here.
	    if (IsPlainBSwap)
	      return IntrinsicLowering::LowerToByteSwap(CI);

	    // rorw $$8, ${0:w} --> llvm.bswap.i16
	    if (CI->getType()->isIntegerTy(16) &&
	        (matchAsm(Stmt, {"rorw", "$$8,", "${0:w}"}) ||
	         matchAsm(Stmt, {"rolw", "$$8,", "${0:w}"})) &&
	        HasOnlyFlagClobbers())
	      return IntrinsicLowering::LowerToByteSwap(CI);

	    return false;
	  }

	  if (NumStatements == 3) {
	    // rorw $$8, ${0:w} / rorl $$16, $0 / rorw $$8, ${0:w} on a 32-bit value.
	    if (CI->getType()->isIntegerTy(32) &&
	        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
	        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
	        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"}) &&
	        HasOnlyFlagClobbers())
	      return IntrinsicLowering::LowerToByteSwap(CI);

	    if (CI->getType()->isIntegerTy(64)) {
	      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
	      const bool IsAThenTied =
	          Constraints.size() >= 2 &&
	          Constraints[0].Codes.size() == 1 &&
	          Constraints[0].Codes[0] == "A" &&
	          Constraints[1].Codes.size() == 1 &&
	          Constraints[1].Codes[0] == "0";
	      // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
	      if (IsAThenTied &&
	          matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
	          matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
	          matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
	        return IntrinsicLowering::LowerToByteSwap(CI);
	    }
	  }

	  return false;
	}

	/// Classify an inline-asm constraint string for this target. Single-letter
	/// constraints and the two-letter 'Y?' family are handled here; everything
	/// else is delegated to the generic TargetLowering implementation.
	X86TargetLowering::ConstraintType
	X86TargetLowering::getConstraintType(StringRef Constraint) const {
	  const size_t Len = Constraint.size();

	  if (Len == 1) {
	    switch (Constraint.front()) {
	    default:
	      break;
	    case 'R':
	    case 'q':
	    case 'Q':
	    case 'f':
	    case 't':
	    case 'u':
	    case 'y':
	    case 'x':
	    case 'v':
	    case 'Y':
	    case 'l':
	    case 'k': // AVX512 masking registers.
	      return C_RegisterClass;
	    case 'a':
	    case 'b':
	    case 'c':
	    case 'd':
	    case 'S':
	    case 'D':
	    case 'A':
	      return C_Register;
	    case 'I':
	    case 'J':
	    case 'K':
	    case 'L':
	    case 'M':
	    case 'N':
	    case 'G':
	    case 'C':
	    case 'e':
	    case 'Z':
	      return C_Other;
	    }
	  } else if (Len == 2 && Constraint.front() == 'Y') {
	    // Two-letter constraints are only recognized with a leading 'Y'.
	    switch (Constraint.back()) {
	    default:
	      break;
	    case 'z':
	    case '0':
	      return C_Register;
	    case 'i':
	    case 'm':
	    case 'k':
	    case 't':
	    case '2':
	      return C_RegisterClass;
	    }
	  }

	  return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// \param info       Operand description; only info.CallOperandVal is read
	///                   here (the whole object is forwarded to the base class
	///                   for unknown constraint letters).
	/// \param constraint NUL-terminated constraint code for the alternative.
	/// \returns CW_Default when there is no operand value, CW_Invalid when the
	///          operand cannot satisfy the constraint, otherwise the weight of
	///          the match.
	TargetLowering::ConstraintWeight
	X86TargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  ConstraintWeight weight = CW_Invalid;
	  Value *CallOperandVal = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!CallOperandVal)
	    return CW_Default;
	  Type *type = CallOperandVal->getType();
	  // Look at the constraint type.
	  switch (*constraint) {
	  default:
	    // Unknown letters start from the generic weight and then deliberately
	    // fall through to the integer-register check below.
	    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	    LLVM_FALLTHROUGH;
	  case 'R':
	  case 'q':
	  case 'Q':
	  case 'a':
	  case 'b':
	  case 'c':
	  case 'd':
	  case 'S':
	  case 'D':
	  case 'A':
	    if (type->isIntegerTy())
	      weight = CW_SpecificReg;
	    break;
	  case 'f':
	  case 't':
	  case 'u':
	    if (type->isFloatingPointTy())
	      weight = CW_SpecificReg;
	    break;
	  case 'y':
	    if (type->isX86_MMXTy() && Subtarget.hasMMX())
	      weight = CW_SpecificReg;
	    break;
	  case 'Y': {
	    unsigned Size = StringRef(constraint).size();
	    // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
	    char NextChar = Size == 2 ? constraint[1] : 'i';
	    // Anything longer than two characters is left at CW_Invalid.
	    if (Size > 2)
	      break;
	    switch (NextChar) {
	    default:
	      return CW_Invalid;
	    // XMM0
	    case 'z':
	    case '0':
	      if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
	        return CW_SpecificReg;
	      return CW_Invalid;
	    // Conditional OpMask regs (AVX512)
	    case 'k':
	      if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	        return CW_Register;
	      return CW_Invalid;
	    // Any MMX reg
	    case 'm':
	      // 'weight' is still CW_Invalid on this path, so returning it would
	      // reject every MMX operand. Use the same weight as the 'y' case.
	      if (type->isX86_MMXTy() && Subtarget.hasMMX())
	        return CW_SpecificReg;
	      return CW_Invalid;
	    // Any SSE reg when ISA >= SSE2, same as 'Y'
	    case 'i':
	    case 't':
	    case '2':
	      if (!Subtarget.hasSSE2())
	        return CW_Invalid;
	      break;
	    }
	    // Fall through (handle "Y" constraint).
	    LLVM_FALLTHROUGH;
	  }
	  case 'v':
	    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
	      weight = CW_Register;
	    LLVM_FALLTHROUGH;
	  case 'x':
	    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
	        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
	      weight = CW_Register;
	    break;
	  case 'k':
	    // Enable conditional vector operations using %k<#> registers.
	    if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	      weight = CW_Register;
	    break;
	  // The remaining letters are immediate constraints: the operand must be a
	  // constant inside the range checked in each case.
	  case 'I':
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 31)
	        weight = CW_Constant;
	    }
	    break;
	  case 'J':
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 63)
	        weight = CW_Constant;
	    }
	    break;
	  case 'K':
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
	        weight = CW_Constant;
	    }
	    break;
	  case 'L':
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
	        weight = CW_Constant;
	    }
	    break;
	  case 'M':
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 3)
	        weight = CW_Constant;
	    }
	    break;
	  case 'N':
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 0xff)
	        weight = CW_Constant;
	    }
	    break;
	  case 'G':
	  case 'C':
	    if (isa<ConstantFP>(CallOperandVal)) {
	      weight = CW_Constant;
	    }
	    break;
	  case 'e':
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getSExtValue() >= -0x80000000LL) &&
	          (C->getSExtValue() <= 0x7fffffffLL))
	        weight = CW_Constant;
	    }
	    break;
	  case 'Z':
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 0xffffffff)
	        weight = CW_Constant;
	    }
	    break;
	  }
	  return weight;
	}

	/// Replace the catch-all 'X' constraint with a more specific one chosen from
	/// the type of the corresponding operand. Floating-point operands get "Y"
	/// when SSE2 is available and "x" when only SSE1 is; all other cases use
	/// the generic TargetLowering choice.
	const char *X86TargetLowering::
	LowerXConstraint(EVT ConstraintVT) const {
	  if (!ConstraintVT.isFloatingPoint())
	    return TargetLowering::LowerXConstraint(ConstraintVT);

	  // Prefer the SSE2 register constraint, then the SSE1 one.
	  if (Subtarget.hasSSE2())
	    return "Y";
	  if (Subtarget.hasSSE1())
	    return "x";

	  // No SSE: fall back to the generic handling ('f' like normal targets).
	  return TargetLowering::LowerXConstraint(ConstraintVT);
	}

	/// Lower the specified operand into the Ops vector.
	/// If it is invalid, don't add anything to Ops.
	///
	/// Control-flow convention used by every case below: 'break' leaves the
	/// switch with Result set (or unset, for the default case), after which a
	/// set Result is pushed onto Ops and an unset one is handed to the generic
	/// TargetLowering implementation. A bare 'return' inside a case rejects the
	/// operand: nothing is added to Ops and the base class is not consulted.
	void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	std::string &Constraint,
	std::vector<SDValue>&Ops,
	SelectionDAG &DAG) const {
	SDValue Result;

	// Only support length 1 constraints for now.
	if (Constraint.length() > 1) return;

	char ConstraintLetter = Constraint[0];
	switch (ConstraintLetter) {
	default: break;
	// 'I': constant whose zero-extended value is at most 31.
	case 'I':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 31) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'J': constant whose zero-extended value is at most 63.
	case 'J':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 63) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'K': constant whose sign-extended value fits in 8 bits.
	case 'K':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (isInt<8>(C->getSExtValue())) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'L': exactly 0xff or 0xffff, plus 0xffffffff on 64-bit subtargets.
	case 'L':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() == 0xff \|\| C->getZExtValue() == 0xffff \|\|
	(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
	Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'M': constant whose zero-extended value is at most 3.
	case 'M':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 3) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'N': constant whose zero-extended value is at most 255.
	case 'N':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 255) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	// 'O': constant whose zero-extended value is at most 127.
	case 'O':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 127) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'e': {
	// 32-bit signed value
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	C->getSExtValue())) {
	// Widen to 64 bits here to get it sign extended.
	Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
	break;
	}
	// FIXME gcc accepts some relocatable values here too, but only in certain
	// memory models; it's complicated.
	}
	return;
	}
	case 'Z': {
	// 32-bit unsigned value
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	C->getZExtValue())) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	// FIXME gcc accepts some relocatable values here too, but only in certain
	// memory models; it's complicated.
	return;
	}
	case 'i': {
	// Literal immediates are always ok.
	if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
	// Widen to 64 bits here to get it sign extended.
	Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
	break;
	}

	// In any sort of PIC mode addresses need to be computed at runtime by
	// adding in a register or some sort of table lookup. These can't
	// be used as immediates.
	if (Subtarget.isPICStyleGOT() \|\| Subtarget.isPICStyleStubPIC())
	return;

	// If we are in non-pic codegen mode, we allow the address of a global (with
	// an optional displacement) to be used with 'i'.
	GlobalAddressSDNode *GA = nullptr;
	int64_t Offset = 0;

	// Match either (GA), (GA+C), (GA+C1+C2), etc.
	// Each iteration peels one constant ADD/SUB off Op and folds it into
	// Offset; the loop ends once Op itself is the global address node.
	while (1) {
	if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
	Offset += GA->getOffset();
	break;
	} else if (Op.getOpcode() == ISD::ADD) {
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	Offset += C->getZExtValue();
	Op = Op.getOperand(0);
	continue;
	}
	} else if (Op.getOpcode() == ISD::SUB) {
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	Offset += -C->getZExtValue();
	Op = Op.getOperand(0);
	continue;
	}
	}

	// Otherwise, this isn't something we can handle, reject it.
	return;
	}

	const GlobalValue *GV = GA->getGlobal();
	// If we require an extra load to get this address, as in PIC mode, we
	// can't accept it.
	if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
	return;

	Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
	GA->getValueType(0), Offset);
	break;
	}
	}

	// A case above produced a target operand: use it.
	if (Result.getNode()) {
	Ops.push_back(Result);
	return;
	}
	// Unhandled constraint letter: defer to the generic implementation.
	return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	/// Check if \p RC is a general purpose register class.
	/// I.e., GR* or one of their variant.
	static bool isGRClass(const TargetRegisterClass &RC) {
	return RC.hasSuperClassEq(&X86::GR8RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR16RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR32RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR64RegClass) \|\|
	RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
	}

	/// Check if \p RC is a vector register class.
	/// I.e., FR* / VR* or one of their variant.
	static bool isFRClass(const TargetRegisterClass &RC) {
	return RC.hasSuperClassEq(&X86::FR32XRegClass) \|\|
	RC.hasSuperClassEq(&X86::FR64XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR128XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR256XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR512RegClass);
	}

	std::pair<unsigned, const TargetRegisterClass *>
	X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	// First, see if this is a constraint that directly corresponds to an LLVM
	// register class.
	if (Constraint.size() == 1) {
	// GCC Constraint Letters
	switch (Constraint[0]) {
	default: break;
	// TODO: Slight differences here in allocation order and leaving
	// RIP in the class. Do they matter any more here than they do
	// in the normal allocation?
	case 'k':
	if (Subtarget.hasAVX512()) {
	// Only supported in AVX512 or later.
	switch (VT.SimpleTy) {
	default: break;
	case MVT::i32:
	return std::make_pair(0U, &X86::VK32RegClass);
	case MVT::i16:
	return std::make_pair(0U, &X86::VK16RegClass);
	case MVT::i8:
	return std::make_pair(0U, &X86::VK8RegClass);
	case MVT::i1:
	return std::make_pair(0U, &X86::VK1RegClass);
	case MVT::i64:
	return std::make_pair(0U, &X86::VK64RegClass);
	}
	}
	break;
	case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
	if (Subtarget.is64Bit()) {
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i64 \|\| VT == MVT::f64)
	return std::make_pair(0U, &X86::GR64RegClass);
	break;
	}
	LLVM_FALLTHROUGH;
	// 32-bit fallthrough
	case 'Q': // Q_REGS
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32_ABCDRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_ABCDRegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::GR64_ABCDRegClass);
	break;
	case 'r': // GENERAL_REGS
	case 'l': // INDEX_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i32 \|\| VT == MVT::f32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32RegClass);
	return std::make_pair(0U, &X86::GR64RegClass);
	case 'R': // LEGACY_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_NOREXRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_NOREXRegClass);
	if (VT == MVT::i32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32_NOREXRegClass);
	return std::make_pair(0U, &X86::GR64_NOREXRegClass);
	case 'f': // FP Stack registers.
	// If SSE is enabled for this VT, use f80 to ensure the isel moves the
	// value to the correct fpstack register class.
	if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP32RegClass);
	if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP64RegClass);
	return std::make_pair(0U, &X86::RFP80RegClass);
	case 'y': // MMX_REGS if MMX allowed.
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'Y': // SSE_REGS if SSE2 allowed
	if (!Subtarget.hasSSE2()) break;
	LLVM_FALLTHROUGH;
	case 'v':
	case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
	if (!Subtarget.hasSSE1()) break;
	bool VConstraint = (Constraint[0] == 'v');

	switch (VT.SimpleTy) {
	default: break;
	// Scalar SSE types.
	case MVT::f32:
	case MVT::i32:
	if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR32XRegClass);
	return std::make_pair(0U, &X86::FR32RegClass);
	case MVT::f64:
	case MVT::i64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR64XRegClass);
	return std::make_pair(0U, &X86::FR64RegClass);
	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	// Vector types.
	case MVT::v16i8:
	case MVT::v8i16:
	case MVT::v4i32:
	case MVT::v2i64:
	case MVT::v4f32:
	case MVT::v2f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR128XRegClass);
	return std::make_pair(0U, &X86::VR128RegClass);
	// AVX types.
	case MVT::v32i8:
	case MVT::v16i16:
	case MVT::v8i32:
	case MVT::v4i64:
	case MVT::v8f32:
	case MVT::v4f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR256XRegClass);
	return std::make_pair(0U, &X86::VR256RegClass);
	case MVT::v8f64:
	case MVT::v16f32:
	case MVT::v16i32:
	case MVT::v8i64:
	return std::make_pair(0U, &X86::VR512RegClass);
	}
	break;
	}
	} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
	switch (Constraint[1]) {
	default:
	break;
	case 'i':
	case 't':
	case '2':
	return getRegForInlineAsmConstraint(TRI, "Y", VT);
	case 'm':
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'z':
	case '0':
	if (!Subtarget.hasSSE1()) break;
	return std::make_pair(X86::XMM0, &X86::VR128RegClass);
	case 'k':
	// This register class doesn't allocate k0 for masked vector operation.
	if (Subtarget.hasAVX512()) { // Only supported in AVX512.
	switch (VT.SimpleTy) {
	default: break;
	case MVT::i32:
	return std::make_pair(0U, &X86::VK32WMRegClass);
	case MVT::i16:
	return std::make_pair(0U, &X86::VK16WMRegClass);
	case MVT::i8:
	return std::make_pair(0U, &X86::VK8WMRegClass);
	case MVT::i1:
	return std::make_pair(0U, &X86::VK1WMRegClass);
	case MVT::i64:
	return std::make_pair(0U, &X86::VK64WMRegClass);
	}
	}
	break;
	}
	}

	// Use the default implementation in TargetLowering to convert the register
	// constraint into a member of a register class.
	std::pair<unsigned, const TargetRegisterClass*> Res;
	Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	// Not found as a standard register?
	if (!Res.second) {
	// Map st(0) -> st(7) -> ST0
	if (Constraint.size() == 7 && Constraint[0] == '{' &&
	tolower(Constraint[1]) == 's' &&
	tolower(Constraint[2]) == 't' &&
	Constraint[3] == '(' &&
	(Constraint[4] >= '0' && Constraint[4] <= '7') &&
	Constraint[5] == ')' &&
	Constraint[6] == '}') {

	Res.first = X86::FP0+Constraint[4]-'0';
	Res.second = &X86::RFP80RegClass;
	return Res;
	}

	// GCC allows "st(0)" to be called just plain "st".
	if (StringRef("{st}").equals_lower(Constraint)) {
	Res.first = X86::FP0;
	Res.second = &X86::RFP80RegClass;
	return Res;
	}

	// flags -> EFLAGS
	if (StringRef("{flags}").equals_lower(Constraint)) {
	Res.first = X86::EFLAGS;
	Res.second = &X86::CCRRegClass;
	return Res;
	}

	// 'A' means [ER]AX + [ER]DX.
	if (Constraint == "A") {
	if (Subtarget.is64Bit()) {
	Res.first = X86::RAX;
	Res.second = &X86::GR64_ADRegClass;
	} else {
	assert((Subtarget.is32Bit() \|\| Subtarget.is16Bit()) &&
	"Expecting 64, 32 or 16 bit subtarget");
	Res.first = X86::EAX;
	Res.second = &X86::GR32_ADRegClass;
	}
	return Res;
	}
	return Res;
	}

	// Otherwise, check to see if this is a register class of the wrong value
	// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
	// turn into {ax},{dx}.
	// MVT::Other is used to specify clobber names.
	if (TRI->isTypeLegalForClass(*Res.second, VT) \|\| VT == MVT::Other)
	return Res; // Correct type already, nothing to do.

	// Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
	// return "eax". This should even work for things like getting 64bit integer
	// registers when given an f64 type.
	const TargetRegisterClass *Class = Res.second;
	// The generic code will match the first register class that contains the
	// given register. Thus, based on the ordering of the tablegened file,
	// the "plain" GR classes might not come first.
	// Therefore, use a helper method.
	if (isGRClass(*Class)) {
	unsigned Size = VT.getSizeInBits();
	if (Size == 1) Size = 8;
	unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
	if (DestReg > 0) {
	bool is64Bit = Subtarget.is64Bit();
	const TargetRegisterClass *RC =
	Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
	: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
	: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
	: &X86::GR64RegClass;
	if (RC->contains(DestReg))
	Res = std::make_pair(DestReg, RC);
	} else {
	// No register found/type mismatch.
	Res.first = 0;
	Res.second = nullptr;
	}
	} else if (isFRClass(*Class)) {
	// Handle references to XMM physical registers that got mapped into the
	// wrong class. This can happen with constraints like {xmm0} where the
	// target independent register mapper will just pick the first match it can
	// find, ignoring the required type.

	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	if (VT == MVT::f32 \|\| VT == MVT::i32)
	Res.second = &X86::FR32RegClass;
	else if (VT == MVT::f64 \|\| VT == MVT::i64)
	Res.second = &X86::FR64RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
	Res.second = &X86::VR128RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
	Res.second = &X86::VR256RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
	Res.second = &X86::VR512RegClass;
	else {
	// Type mismatch and not a clobber: Return an error;
	Res.first = 0;
	Res.second = nullptr;
	}
	}

	return Res;
	}

	/// Report the extra cost of the scaled-index part of addressing mode \p AM
	/// for an access of type \p Ty in address space \p AS.
	///
	/// \returns -1 when the addressing mode is not legal, 0 when it is legal and
	/// has no scale (AM.Scale == 0), and 1 when it is legal and has a scale.
	int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                            const AddrMode &AM, Type *Ty,
	                                            unsigned AS) const {
	  // Nothing to price if the target cannot encode this mode at all.
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // A scaled index is never free. A folded indexed access such as
	  //   vaddps (%rsi,%rdx), %ymm0, %ymm1
	  // needs two allocations in the out-of-order engine (one for the load, one
	  // for the computation), whereas the plain form
	  //   vaddps (%rsi), %ymm0, %ymm1
	  // needs only one, which leaves allocations for other operations and means
	  // fewer micro operations to execute.
	  //
	  // On some X86 architectures it is worse still: for stores, the complex
	  // addressing mode forces the instruction onto the "load" ports instead of
	  // the dedicated "store" port. E.g., on Haswell:
	  //   vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
	  //   vmovaps %ymm1, (%r8)       can use port 2, 3, or 7.
	  //
	  // Scale stands for reg2 * scale, so charge 1 as soon as a second register
	  // is involved.
	  return AM.Scale != 0 ? 1 : 0;
	}

	/// Decide whether an integer division of type \p VT should be kept as a
	/// division rather than replaced by an alternative sequence.
	///
	/// \returns true only when the function attributes in \p Attr include
	/// MinSize and \p VT is not a vector type.
	bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // x86 integer division is expensive, but a div instruction is usually
	  // smaller than the sequence that would replace it, so it is only kept when
	  // aggressively optimizing for code size.
	  const bool MinSize =
	      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
	  if (!MinSize)
	    return false;

	  // Vector division is the exception: x86 has no vector integer divide, so
	  // leaving it in place means scalarizing it, which loses even on size,
	  // while the alternative sequence can stay in vector form.
	  return !VT.isVector();
	}

	/// Mark the function that owns \p Entry as using split callee-saved-register
	/// handling. Only done for 64-bit subtargets; otherwise this is a no-op.
	void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  if (Subtarget.is64Bit()) {
	    // Record the decision in the per-function X86MachineFunctionInfo.
	    auto *MF = Entry->getParent();
	    MF->getInfo<X86MachineFunctionInfo>()->setIsSplitCSR(true);
	  }
	}

	/// Preserve the callee-saved registers that are handled "via copy".
	///
	/// For each register returned by getCalleeSavedRegsViaCopy, a COPY into a
	/// fresh virtual register is inserted at the beginning of \p Entry, and a
	/// COPY back into the physical register is inserted before the first
	/// terminator of every block in \p Exits. Returns without doing anything
	/// when the register info reports no such list.
	void X86TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // Walk the register list through a pointer: the loop stops at the first
	  // zero entry, and *I is the physical register being processed.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    // Only 64-bit general purpose registers are expected in this list.
	    if (X86::GR64RegClass.contains(*I))
	      RC = &X86::GR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction().hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	/// Swift error support is reported exactly when the subtarget is 64-bit.
	bool X86TargetLowering::supportSwiftError() const {
	  const bool Is64Bit = Subtarget.is64Bit();
	  return Is64Bit;
	}

	/// Returns the name of the symbol used to emit stack probes or the empty
	/// string if not applicable.
	///
	/// An explicit "probe-stack" function attribute always wins and its value is
	/// returned verbatim. Otherwise a symbol is only chosen for Windows targets
	/// that are not MachO, based on 64-bit-ness and on whether the target is
	/// Cygwin/MinGW.
	StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
	  // If the function specifically requests stack probes, emit them.
	  if (MF.getFunction().hasFnAttribute("probe-stack"))
	    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

	  // Generally, if we aren't on Windows, the platform ABI does not include
	  // support for stack probes, so don't emit them.
	  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
	    return "";

	  // We need a stack probe to conform to the Windows ABI. Choose the right
	  // symbol.
	  if (Subtarget.is64Bit())
	    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
	  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
	}
	Index: head/contrib/llvm/lib/Target/X86/X86ISelLowering.h
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86ISelLowering.h (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86ISelLowering.h (revision 328817)
	@@ -1,1518 +1,1524 @@
	//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that X86 uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
	#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H

	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/Target/TargetOptions.h"

	namespace llvm {
	class X86Subtarget;
	class X86TargetMachine;

	/// Opcodes of the X86-specific SelectionDAG nodes. Numbering starts at
	/// ISD::BUILTIN_OP_END; LCMPXCHG_DAG is pinned to
	/// ISD::FIRST_TARGET_MEMORY_OPCODE, so every enumerator from there on is
	/// numbered as a target memory opcode. Enumerator order determines the
	/// numeric values.
	namespace X86ISD {
	// X86 Specific DAG Nodes
	enum NodeType : unsigned {
	// Start the numbering where the builtin ops leave off.
	FIRST_NUMBER = ISD::BUILTIN_OP_END,

	/// Bit scan forward.
	BSF,
	/// Bit scan reverse.
	BSR,

	/// Double shift instructions. These correspond to
	/// X86::SHLDxx and X86::SHRDxx instructions.
	SHLD,
	SHRD,

	/// Bitwise logical AND of floating point values. This corresponds
	/// to X86::ANDPS or X86::ANDPD.
	FAND,

	/// Bitwise logical OR of floating point values. This corresponds
	/// to X86::ORPS or X86::ORPD.
	FOR,

	/// Bitwise logical XOR of floating point values. This corresponds
	/// to X86::XORPS or X86::XORPD.
	FXOR,

	/// Bitwise logical ANDNOT of floating point values. This
	/// corresponds to X86::ANDNPS or X86::ANDNPD.
	FANDN,

	/// These operations represent an abstract X86 call
	/// instruction, which includes a bunch of information. In particular the
	/// operands of these nodes are:
	///
	/// #0 - The incoming token chain
	/// #1 - The callee
	/// #2 - The number of arg bytes the caller pushes on the stack.
	/// #3 - The number of arg bytes the callee pops off the stack.
	/// #4 - The value to pass in AL/AX/EAX (optional)
	/// #5 - The value to pass in DL/DX/EDX (optional)
	///
	/// The result values of these nodes are:
	///
	/// #0 - The outgoing token chain
	/// #1 - The first register result value (optional)
	/// #2 - The second register result value (optional)
	///
	CALL,

	/// This operation implements the lowering for readcyclecounter.
	RDTSC_DAG,

	/// X86 Read Time-Stamp Counter and Processor ID.
	RDTSCP_DAG,

	/// X86 Read Performance Monitoring Counters.
	RDPMC_DAG,

	/// X86 compare and logical compare instructions.
	CMP, COMI, UCOMI,

	/// X86 bit-test instructions.
	BT,

	/// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
	/// operand, usually produced by a CMP instruction.
	SETCC,

	/// X86 Select
	SELECT, SELECTS,

	// Same as SETCC except it's materialized with a sbb and the value is all
	// ones or all zeros.
	SETCC_CARRY, // R = carry_bit ? ~0 : 0

	/// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
	/// Operands are two FP values to compare; result is a mask of
	/// 0s or 1s. Generally DTRT for C/C++ with NaNs.
	FSETCC,

	/// X86 FP SETCC, similar to above, but with output as an i1 mask and
	/// with optional rounding mode.
	FSETCCM, FSETCCM_RND,

	/// X86 conditional moves. Operand 0 and operand 1 are the two values
	/// to select from. Operand 2 is the condition code, and operand 3 is the
	/// flag operand produced by a CMP or TEST instruction. It also writes a
	/// flag result.
	CMOV,

	/// X86 conditional branches. Operand 0 is the chain operand, operand 1
	/// is the block to branch if condition is true, operand 2 is the
	/// condition code, and operand 3 is the flag operand produced by a CMP
	/// or TEST instruction.
	BRCOND,

	/// Return with a flag operand. Operand 0 is the chain operand, operand
	/// 1 is the number of bytes of stack to pop.
	RET_FLAG,

	/// Return from interrupt. Operand 0 is the number of bytes to pop.
	IRET,

	/// Repeat fill, corresponds to X86::REP_STOSx.
	REP_STOS,

	/// Repeat move, corresponds to X86::REP_MOVSx.
	REP_MOVS,

	/// On Darwin, this node represents the result of the popl
	/// at function entry, used for PIC code.
	GlobalBaseReg,

	/// A wrapper node for TargetConstantPool, TargetJumpTable,
	/// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
	/// MCSymbol and TargetBlockAddress.
	Wrapper,

	/// Special wrapper used under X86-64 PIC mode for RIP
	/// relative displacements.
	WrapperRIP,

	/// Copies a 64-bit value from the low word of an XMM vector
	/// to an MMX vector.
	MOVDQ2Q,

	/// Copies a 32-bit value from the low word of a MMX
	/// vector to a GPR.
	MMX_MOVD2W,

	/// Copies a GPR into the low 32-bit word of a MMX vector
	/// and zero out the high word.
	MMX_MOVW2D,

	/// Extract an 8-bit value from a vector and zero extend it to
	/// i32, corresponds to X86::PEXTRB.
	PEXTRB,

	/// Extract a 16-bit value from a vector and zero extend it to
	/// i32, corresponds to X86::PEXTRW.
	PEXTRW,

	/// Insert any element of a 4 x float vector into any element
	/// of a destination 4 x float vector.
	INSERTPS,

	/// Insert the lower 8-bits of a 32-bit value to a vector,
	/// corresponds to X86::PINSRB.
	PINSRB,

	/// Insert the lower 16-bits of a 32-bit value to a vector,
	/// corresponds to X86::PINSRW.
	PINSRW,

	/// Shuffle 16 8-bit values within a vector.
	PSHUFB,

	/// Compute Sum of Absolute Differences.
	PSADBW,
	/// Compute Double Block Packed Sum-Absolute-Differences
	DBPSADBW,

	/// Bitwise Logical AND NOT of Packed FP values.
	ANDNP,

	/// Blend where the selector is an immediate.
	BLENDI,

	/// Dynamic (non-constant condition) vector blend where only the sign bits
	/// of the condition elements are used. This is used to enforce that the
	/// condition mask is not valid for generic VSELECT optimizations.
	SHRUNKBLEND,

	/// Combined add and sub on an FP vector.
	ADDSUB,

	// FP vector ops with rounding mode.
	FADD_RND, FADDS_RND,
	FSUB_RND, FSUBS_RND,
	FMUL_RND, FMULS_RND,
	FDIV_RND, FDIVS_RND,
	FMAX_RND, FMAXS_RND,
	FMIN_RND, FMINS_RND,
	FSQRT_RND, FSQRTS_RND,

	// FP vector get exponent.
	FGETEXP_RND, FGETEXPS_RND,
	// Extract Normalized Mantissas.
	VGETMANT, VGETMANT_RND, VGETMANTS, VGETMANTS_RND,
	// FP Scale.
	SCALEF,
	SCALEFS,

	// Integer add/sub with unsigned saturation.
	ADDUS,
	SUBUS,

	// Integer add/sub with signed saturation.
	ADDS,
	SUBS,

	// Unsigned Integer average.
	AVG,

	/// Integer horizontal add/sub.
	HADD,
	HSUB,

	/// Floating point horizontal add/sub.
	FHADD,
	FHSUB,

	// Detect Conflicts Within a Vector
	CONFLICT,

	/// Floating point max and min.
	FMAX, FMIN,

	/// Commutative FMIN and FMAX.
	FMAXC, FMINC,

	/// Scalar intrinsic floating point max and min.
	FMAXS, FMINS,

	/// Floating point reciprocal-sqrt and reciprocal approximation.
	/// Note that these typically require refinement
	/// in order to obtain suitable precision.
	FRSQRT, FRCP,

	// AVX-512 reciprocal approximations with a little more precision.
	RSQRT14, RSQRT14S, RCP14, RCP14S,

	// Thread Local Storage.
	TLSADDR,

	// Thread Local Storage. A call to get the start address
	// of the TLS block for the current module.
	TLSBASEADDR,

	// Thread Local Storage. When calling to an OS provided
	// thunk at the address from an earlier relocation.
	TLSCALL,

	// Exception Handling helpers.
	EH_RETURN,

	// SjLj exception handling setjmp.
	EH_SJLJ_SETJMP,

	// SjLj exception handling longjmp.
	EH_SJLJ_LONGJMP,

	// SjLj exception handling dispatch.
	EH_SJLJ_SETUP_DISPATCH,

	/// Tail call return. See X86TargetLowering::LowerCall for
	/// the list of operands.
	TC_RETURN,

	// Vector move to low scalar and zero higher vector elements.
	VZEXT_MOVL,

	// Vector integer zero-extend.
	VZEXT,
	// Vector integer signed-extend.
	VSEXT,

	// Vector integer truncate.
	VTRUNC,
	// Vector integer truncate with unsigned/signed saturation.
	VTRUNCUS, VTRUNCS,

	// Vector FP extend.
	VFPEXT, VFPEXT_RND, VFPEXTS_RND,

	// Vector FP round.
	VFPROUND, VFPROUND_RND, VFPROUNDS_RND,

	// Convert a vector to mask, set bits based on MSB.
	CVT2MASK,

	// 128-bit vector logical left / right shift
	VSHLDQ, VSRLDQ,

	// Vector shift elements
	VSHL, VSRL, VSRA,

	// Vector variable shift right arithmetic.
	// Unlike ISD::SRA, when the shift count is greater than the element size,
	// the sign bit is used to fill the destination data element.
	VSRAV,

	// Vector shift elements by immediate
	VSHLI, VSRLI, VSRAI,

	// Shifts of mask registers.
	KSHIFTL, KSHIFTR,

	// Bit rotate by immediate
	VROTLI, VROTRI,

	// Vector packed double/float comparison.
	CMPP,

	// Vector integer comparisons.
	PCMPEQ, PCMPGT,
	// Vector integer comparisons, the result is in a mask vector.
	PCMPEQM, PCMPGTM,

	// v8i16 Horizontal minimum and position.
	PHMINPOS,

	MULTISHIFT,

	/// Vector comparison generating mask bits for fp and
	/// integer signed and unsigned data types.
	CMPM,
	CMPMU,
	// Vector comparison with rounding mode for FP values
	CMPM_RND,

	// Arithmetic operations with FLAGS results.
	ADD, SUB, ADC, SBB, SMUL,
	INC, DEC, OR, XOR, AND,

	// LOW, HI, FLAGS = umul LHS, RHS.
	UMUL,

	// 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS.
	SMUL8, UMUL8,

	// 8-bit divrem that zero-extend the high result (AH).
	UDIVREM8_ZEXT_HREG,
	SDIVREM8_SEXT_HREG,

	// X86-specific multiply by immediate.
	MUL_IMM,

	// Vector sign bit extraction.
	MOVMSK,

	// Vector bitwise comparisons.
	PTEST,

	// Vector packed fp sign bitwise comparisons.
	TESTP,

	// Vector "test" in AVX-512, the result is in a mask vector.
	TESTM,
	TESTNM,

	// OR/AND test for masks.
	KORTEST,
	KTEST,

	// Several flavors of instructions with vector shuffle behaviors.
	// Saturated signed/unsigned packing.
	PACKSS,
	PACKUS,
	// Intra-lane alignr.
	PALIGNR,
	// AVX512 inter-lane alignr.
	VALIGN,
	PSHUFD,
	PSHUFHW,
	PSHUFLW,
	SHUFP,
	// VBMI2 Concat & Shift.
	VSHLD,
	VSHRD,
	VSHLDV,
	VSHRDV,
	// Shuffle Packed Values at 128-bit granularity.
	SHUF128,
	MOVDDUP,
	MOVSHDUP,
	MOVSLDUP,
	MOVLHPS,
	MOVHLPS,
	MOVLPS,
	MOVLPD,
	MOVSD,
	MOVSS,
	UNPCKL,
	UNPCKH,
	VPERMILPV,
	VPERMILPI,
	VPERMI,
	VPERM2X128,

	// Variable Permute (VPERM).
	// Res = VPERMV MaskV, V0
	VPERMV,

	// 3-op Variable Permute (VPERMT2).
	// Res = VPERMV3 V0, MaskV, V1
	VPERMV3,

	// 3-op Variable Permute overwriting the index (VPERMI2).
	// Res = VPERMIV3 V0, MaskV, V1
	VPERMIV3,

	// Bitwise ternary logic.
	VPTERNLOG,
	// Fix Up Special Packed Float32/64 values.
	VFIXUPIMM,
	VFIXUPIMMS,
	// Range Restriction Calculation For Packed Pairs of Float32/64 values.
	VRANGE, VRANGE_RND, VRANGES, VRANGES_RND,
	// Reduce - Perform Reduction Transformation on scalar/packed FP.
	VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND,
	// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
	// Also used by the legacy (V)ROUND intrinsics where we mask out the
	// scaling part of the immediate.
	VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND,
	// Tests the types of FP values for packed types.
	VFPCLASS,
	// Tests the types of FP values for scalar types.
	VFPCLASSS,

	// Broadcast scalar to vector.
	VBROADCAST,
	// Broadcast mask to vector.
	VBROADCASTM,
	// Broadcast subvector to vector.
	SUBV_BROADCAST,

	/// SSE4A Extraction and Insertion.
	EXTRQI, INSERTQI,

	// XOP arithmetic/logical shifts.
	VPSHA, VPSHL,
	// XOP signed/unsigned integer comparisons.
	VPCOM, VPCOMU,
	// XOP packed permute bytes.
	VPPERM,
	// XOP two source permutation.
	VPERMIL2,

	// Vector multiply packed unsigned doubleword integers.
	PMULUDQ,
	// Vector multiply packed signed doubleword integers.
	PMULDQ,
	// Vector Multiply Packed UnsignedIntegers with Round and Scale.
	MULHRS,

	// Multiply and Add Packed Integers.
	VPMADDUBSW, VPMADDWD,

	// AVX512IFMA multiply and add.
	// NOTE: These are different than the instruction and perform
	// op0 x op1 + op2.
	VPMADD52L, VPMADD52H,

	// VNNI
	VPDPBUSD,
	VPDPBUSDS,
	VPDPWSSD,
	VPDPWSSDS,

	// FMA nodes.
	// We use the target independent ISD::FMA for the non-inverted case.
	FNMADD,
	FMSUB,
	FNMSUB,
	FMADDSUB,
	FMSUBADD,

	// FMA with rounding mode.
	FMADD_RND,
	FNMADD_RND,
	FMSUB_RND,
	FNMSUB_RND,
	FMADDSUB_RND,
	FMSUBADD_RND,

	// FMA4 specific scalar intrinsics bits that zero the non-scalar bits.
	FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S,

	// Scalar intrinsic FMA.
	FMADDS1, FMADDS3,
	FNMADDS1, FNMADDS3,
	FMSUBS1, FMSUBS3,
	FNMSUBS1, FNMSUBS3,

	// Scalar intrinsic FMA with rounding mode.
	// Two versions, passthru bits on op1 or op3.
	FMADDS1_RND, FMADDS3_RND,
	FNMADDS1_RND, FNMADDS3_RND,
	FMSUBS1_RND, FMSUBS3_RND,
	FNMSUBS1_RND, FNMSUBS3_RND,

	// Compress and expand.
	COMPRESS,
	EXPAND,

	// Bits shuffle
	VPSHUFBITQMB,

	// Convert Unsigned/Integer to Floating-Point Value with rounding mode.
	SINT_TO_FP_RND, UINT_TO_FP_RND,
	SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,

	// Vector float/double to signed/unsigned integer.
	CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
	// Scalar float/double to signed/unsigned integer.
	CVTS2SI_RND, CVTS2UI_RND,

	// Vector float/double to signed/unsigned integer with truncation.
	CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND,
	// Scalar float/double to signed/unsigned integer with truncation.
	CVTTS2SI_RND, CVTTS2UI_RND,

	// Vector signed/unsigned integer to float/double.
	CVTSI2P, CVTUI2P,

	// Save xmm argument registers to the stack, according to %al. An operator
	// is needed so that this can be expanded with control flow.
	VASTART_SAVE_XMM_REGS,

	// Windows's _chkstk call to do stack probing.
	WIN_ALLOCA,

	// For allocating variable amounts of stack space when using
	// segmented stacks. Check if the current stacklet has enough space, and
	// falls back to heap allocation if not.
	SEG_ALLOCA,

	// Memory barriers.
	MEMBARRIER,
	MFENCE,

	// Store FP status word into i16 register.
	FNSTSW16r,

	// Store contents of %ah into %eflags.
	SAHF,

	// Get a random integer and indicate whether it is valid in CF.
	RDRAND,

	// Get a NIST SP800-90B & C compliant random integer and
	// indicate whether it is valid in CF.
	RDSEED,

	// SSE42 string comparisons.
	PCMPISTRI,
	PCMPESTRI,

	// Test if in transactional execution.
	XTEST,

	// ERI instructions.
	RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,

	// Conversions between float and half-float.
	CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,

	// Galois Field Arithmetic Instructions
	GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,

	// LWP insert record.
	LWPINS,

	// Compare and swap.
	LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
	LCMPXCHG8_DAG,
	LCMPXCHG16_DAG,
	LCMPXCHG8_SAVE_EBX_DAG,
	LCMPXCHG16_SAVE_RBX_DAG,

	/// LOCK-prefixed arithmetic read-modify-write instructions.
	/// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
	LADD, LSUB, LOR, LXOR, LAND, LINC, LDEC,

	// Load, scalar_to_vector, and zero extend.
	VZEXT_LOAD,

	// Store FP control word into i16 memory.
	FNSTCW16m,

	/// This instruction implements FP_TO_SINT with the
	/// integer destination in memory and a FP reg source. This corresponds
	/// to the X86::FIST*m instructions and the rounding mode change stuff. It
	/// has two inputs (token chain and address) and two outputs (int value
	/// and token chain).
	FP_TO_INT16_IN_MEM,
	FP_TO_INT32_IN_MEM,
	FP_TO_INT64_IN_MEM,

	/// This instruction implements SINT_TO_FP with the
	/// integer source in memory and FP reg result. This corresponds to the
	/// X86::FILD*m instructions. It has three inputs (token chain, address,
	/// and source type) and two outputs (FP value and token chain). FILD_FLAG
	/// also produces a flag.
	FILD,
	FILD_FLAG,

	/// This instruction implements an extending load to FP stack slots.
	/// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
	/// operand, ptr to load from, and a ValueType node indicating the type
	/// to load to.
	FLD,

	/// This instruction implements a truncating store to FP stack
	/// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
	/// chain operand, value to store, address, and a ValueType to store it
	/// as.
	FST,

	/// This instruction grabs the address of the next argument
	/// from a va_list. (reads and modifies the va_list in memory)
	VAARG_64,

	// Vector truncating store with unsigned/signed saturation
	VTRUNCSTOREUS, VTRUNCSTORES,
	// Vector truncating masked store with unsigned/signed saturation
	VMTRUNCSTOREUS, VMTRUNCSTORES,

	// X86 specific gather and scatter
	MGATHER, MSCATTER,

	// WARNING: Do not add anything at the end unless you want the node to
	// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
	// opcodes will be treated as target memory ops!
	};
	} // end namespace X86ISD

	/// Define some predicates that are used for node matching.
	namespace X86 {
	/// Returns true if Elt is a constant zero or floating point constant +0.0.
	bool isZeroNode(SDValue Elt);

	/// Returns true if the given offset can
	/// fit into the displacement field of the instruction.
	bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
	bool hasSymbolicDisplacement = true);

	/// Determines whether the callee is required to pop its
	/// own arguments. Callee pop is necessary to support tail calls.
	bool isCalleePop(CallingConv::ID CallingConv,
	bool is64Bit, bool IsVarArg, bool GuaranteeTCO);

	} // end namespace X86

	//===--------------------------------------------------------------------===//
	// X86 Implementation of the TargetLowering interface
	class X86TargetLowering final : public TargetLowering {
	public:
	explicit X86TargetLowering(const X86TargetMachine &TM,
	const X86Subtarget &STI);

	unsigned getJumpTableEncoding() const override;
	bool useSoftFloat() const override;

	void markLibCallAttributes(MachineFunction *MF, unsigned CC,
	ArgListTy &Args) const override;

	/// The scalar shift amount type is always i8; neither the data layout nor
	/// \p VT influences the answer.
	MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
	  const MVT ShiftAmtTy = MVT::i8;
	  return ShiftAmtTy;
	}

	const MCExpr *
	LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
	const MachineBasicBlock *MBB, unsigned uid,
	MCContext &Ctx) const override;

	/// Returns relocation base for the given PIC jumptable.
	SDValue getPICJumpTableRelocBase(SDValue Table,
	SelectionDAG &DAG) const override;
	const MCExpr *
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
	unsigned JTI, MCContext &Ctx) const override;

	/// Return the desired alignment for ByVal aggregate
	/// function arguments in the caller parameter area. For X86, aggregates
	/// that contain SSE vectors are placed at 16-byte boundaries while the rest are at
	/// 4-byte boundaries.
	unsigned getByValTypeAlignment(Type *Ty,
	const DataLayout &DL) const override;

	/// Returns the target specific optimal type for load
	/// and store operations as a result of memset, memcpy, and memmove
	/// lowering. If DstAlign is zero that means the destination
	/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
	/// means there isn't a need to check it against alignment requirement,
	/// probably because the source does not need to be loaded. If 'IsMemset' is
	/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
	/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
	/// source is constant so it does not need to be loaded.
	/// It returns EVT::Other if the type should be determined using generic
	/// target-independent logic.
	EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
	bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
	MachineFunction &MF) const override;

	/// Returns true if it's safe to use load / store of the
	/// specified type to expand memcpy / memset inline. This is mostly true
	/// for all types except for some special cases. For example, on X86
	/// targets without SSE2 f64 load / store are done with fldl / fstpl which
	/// also does type conversion. Note the specified type doesn't have to be
	/// legal as the hook is used before type legalization.
	bool isSafeMemOpType(MVT VT) const override;

	/// Returns true if the target allows unaligned memory accesses of the
	/// specified type. Returns whether it is "fast" in the last argument.
	bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
	bool *Fast) const override;

	/// Provide custom lowering hooks for some operations.
	///
	SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;

	/// Places new result values for the node in Results (their number
	/// and types must exactly match those of the original return values of
	/// the node), or leaves Results empty, which indicates that the node is not
	/// to be custom lowered after all.
	void LowerOperationWrapper(SDNode *N,
	SmallVectorImpl<SDValue> &Results,
	SelectionDAG &DAG) const override;

	/// Replace the results of node with an illegal result
	/// type with new values built out of custom code.
	///
	void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
	SelectionDAG &DAG) const override;

	SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

	// Return true if it is profitable to combine a BUILD_VECTOR with a
	// stride-pattern to a shuffle and a truncate.
	// Example of such a combine:
	// v4i32 build_vector((extract_elt V, 1),
	// (extract_elt V, 3),
	// (extract_elt V, 5),
	// (extract_elt V, 7))
	// -->
	// v4i32 truncate (bitcast (shuffle<1,u,3,u,5,u,7,u> V, u) to
	// v4i64)
	bool isDesirableToCombineBuildVectorToShuffleTruncate(
	ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;

	/// Return true if the target has native support for
	/// the specified value type and it is 'desirable' to use the type for the
	/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
	/// instruction encodings are longer and some i16 instructions are slow.
	bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;

	/// Return true if the target has native support for the
	/// specified value type and it is 'desirable' to use the type. e.g. On x86
	/// i16 is legal, but undesirable since i16 instruction encodings are longer
	/// and some i16 instructions are slow.
	bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;

	MachineBasicBlock *
	EmitInstrWithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const override;

	/// This method returns the name of a target specific DAG node.
	const char *getTargetNodeName(unsigned Opcode) const override;

	// Unconditionally returns true.
	// NOTE(review): presumably this opts X86 in to merging stores after
	// legalization - confirm against the TargetLowering base declaration.
	bool mergeStoresAfterLegalization() const override { return true; }

	bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
	const SelectionDAG &DAG) const override;

	bool isCheapToSpeculateCttz() const override;

	bool isCheapToSpeculateCtlz() const override;

	bool isCtlzFast() const override;

	/// Returns true when \p VT is f32, f64 or any vector type, and false for
	/// every other type.
	bool hasBitPreservingFPLogic(EVT VT) const override {
	  return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
	}

	/// Returns true when storing the low part \p LTy and the high part \p HTy
	/// of a pair with separate stores is expected to beat merging their bits
	/// first. That is only the case when one part is floating point and the
	/// other is an integer.
	bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
	  // If the pair to store is a mixture of float and int values, we will
	  // save two bitwise instructions and one float-to-int instruction and
	  // increase one store instruction. There is potentially a more
	  // significant benefit because it avoids the float->int domain switch
	  // for input value. So It is more likely a win.
	  if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
	      (LTy.isInteger() && HTy.isFloatingPoint()))
	    return true;
	  // If the pair only contains int values, we will save two bitwise
	  // instructions and increase one store instruction (costing one more
	  // store buffer). Since the benefit is more blurred so we leave
	  // such pair out until we get testcase to prove it is a win.
	  return false;
	}

	bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;

	bool hasAndNotCompare(SDValue Y) const override;

	/// Returns VT.isScalarInteger(), i.e. true only for scalar integer types.
	bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
	  const bool IsScalarInt = VT.isScalarInteger();
	  return IsScalarInt;
	}

	/// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
	MVT hasFastEqualityCompare(unsigned NumBits) const override;

	/// Return the value type to use for ISD::SETCC.
	EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
	EVT VT) const override;

	/// Determine which of the bits specified in Mask are known to be either
	/// zero or one and return them in the KnownZero/KnownOne bitsets.
	void computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const override;

	/// Determine the number of bits in the operation that are sign bits.
	unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const override;

	SDValue unwrapAddress(SDValue N) const override;

	bool isGAPlusOffset(SDNode N, const GlobalValue &GA,
	int64_t &Offset) const override;

	SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;

	bool ExpandInlineAsm(CallInst *CI) const override;

	ConstraintType getConstraintType(StringRef Constraint) const override;

	/// Examine constraint string and operand type and determine a weight value.
	/// The operand object must already have been set up with the operand type.
	ConstraintWeight
	getSingleConstraintMatchWeight(AsmOperandInfo &info,
	const char *constraint) const override;

	const char *LowerXConstraint(EVT ConstraintVT) const override;

	/// Lower the specified operand into the Ops vector. If it is invalid, don't
	/// add anything to Ops. If hasMemory is true it means one of the asm
	/// constraint of the inline asm instruction being processed is 'm'.
	void LowerAsmOperandForConstraint(SDValue Op,
	std::string &Constraint,
	std::vector<SDValue> &Ops,
	SelectionDAG &DAG) const override;

	/// Map an inline-asm memory constraint code to its InlineAsm::Constraint_*
	/// value. The codes "i", "o", "v" and "X" are handled here; any other code
	/// is forwarded to TargetLowering::getInlineAsmMemConstraint.
	unsigned
	getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
	  if (ConstraintCode == "i")
	    return InlineAsm::Constraint_i;
	  if (ConstraintCode == "o")
	    return InlineAsm::Constraint_o;
	  if (ConstraintCode == "v")
	    return InlineAsm::Constraint_v;
	  if (ConstraintCode == "X")
	    return InlineAsm::Constraint_X;
	  // Not one of the codes special-cased above: defer to the base class.
	  return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
	}

	/// Given a physical register constraint
	/// (e.g. {edx}), return the register number and the register class for the
	/// register. This should only be used for C_Register constraints. On
	/// error, this returns a register number of 0.
	std::pair<unsigned, const TargetRegisterClass *>
	getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint, MVT VT) const override;

	/// Return true if the addressing mode represented
	/// by AM is legal for this target, for a load/store of the specified type.
	bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
	Type *Ty, unsigned AS,
	Instruction *I = nullptr) const override;

	/// Return true if the specified immediate is legal
	/// icmp immediate, that is the target has icmp instructions which can
	/// compare a register against the immediate without having to materialize
	/// the immediate into a register.
	bool isLegalICmpImmediate(int64_t Imm) const override;

	/// Return true if the specified immediate is legal
	/// add immediate, that is the target has add instructions which can
	/// add a register and the immediate without having to materialize
	/// the immediate into a register.
	bool isLegalAddImmediate(int64_t Imm) const override;

	/// \brief Return the cost of the scaling factor used in the addressing
	/// mode represented by AM for this target, for a load/store
	/// of the specified type.
	/// If the AM is supported, the return value must be >= 0.
	/// If the AM is not supported, it returns a negative value.
	int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
	unsigned AS) const override;

	bool isVectorShiftByScalarCheap(Type *Ty) const override;

	/// Return true if it's free to truncate a value of
	/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
	/// register EAX to i16 by referencing its sub-register AX.
	bool isTruncateFree(Type Ty1, Type Ty2) const override;
	bool isTruncateFree(EVT VT1, EVT VT2) const override;

	bool allowTruncateForTailCall(Type Ty1, Type Ty2) const override;

	/// Return true if any actual instruction that defines a
	/// value of type Ty1 implicit zero-extends the value to Ty2 in the result
	/// register. This does not necessarily include registers defined in
	/// unknown ways, such as incoming arguments, or copies from unknown
	/// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
	/// does not necessarily apply to truncate instructions. e.g. on x86-64,
	/// all instructions that define 32-bit values implicit zero-extend the
	/// result out to 64 bits.
	bool isZExtFree(Type Ty1, Type Ty2) const override;
	bool isZExtFree(EVT VT1, EVT VT2) const override;
	bool isZExtFree(SDValue Val, EVT VT2) const override;

	/// Return true if folding a vector load into ExtVal (a sign, zero, or any
	/// extend node) is profitable.
	bool isVectorLoadExtDesirable(SDValue) const override;

	/// Return true if an FMA operation is faster than a pair of fmul and fadd
	/// instructions. fmuladd intrinsics will be expanded to FMAs when this
	/// method returns true, otherwise fmuladd is expanded to fmul + fadd.
	bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;

	/// Return true if it's profitable to narrow
	/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
	/// from i32 to i8 but not from i32 to i16.
	bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;

	/// Given an intrinsic, checks if on the target the intrinsic will need to map
	/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
	/// true and stores the intrinsic information into the IntrinsicInfo that was
	/// passed to the function.
	bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
	MachineFunction &MF,
	unsigned Intrinsic) const override;

	/// Returns true if the target can instruction select the
	/// specified FP immediate natively. If false, the legalizer will
	/// materialize the FP immediate as a load from a constant pool.
	bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;

	/// Targets can use this to indicate that they only support some
	/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
	/// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
	/// be legal.
	bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;

	/// Similar to isShuffleMaskLegal. Targets can use this to
	/// indicate if there is a suitable VECTOR_SHUFFLE that can be used to
	/// replace a VAND with a constant pool entry.
	bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
	EVT VT) const override;

	+ /// Returns true if lowering to a jump table is allowed.
	+ bool areJTsAllowed(const Function *Fn) const override;
	+
	/// If true, then instruction selection should
	/// seek to shrink the FP constant of the specified type to a smaller type
	/// in order to save space and / or reduce runtime.
	/// Returns true when X86ScalarSSEf64 is unset, or when \p VT is f80.
	bool ShouldShrinkFPConstant(EVT VT) const override {
	  // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
	  // expensive than a straight movsd. On the other hand, it's important to
	  // shrink long double fp constant since fldt is very slow.
	  return !X86ScalarSSEf64 || VT == MVT::f80;
	}

	/// Return true if we believe it is correct and profitable to reduce the
	/// load node to a smaller type.
	bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
	EVT NewVT) const override;

	/// Return true if the specified scalar FP type is computed in an SSE
	/// register, not on the X87 floating point stack.
	/// That is: f64 with X86ScalarSSEf64 set, or f32 with X86ScalarSSEf32 set.
	bool isScalarFPTypeInSSEReg(EVT VT) const {
	  return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
	         (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
	}

	/// \brief Returns true if it is beneficial to convert a load of a constant
	/// to just the constant itself.
	bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
	Type *Ty) const override;

	bool convertSelectOfConstantsToMath(EVT VT) const override;

	/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
	/// with this index.
	bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	unsigned Index) const override;

	/// Returns true once the vector constant stands in for more than two scalar
	/// stores. \p MemVT and \p AddrSpace are not consulted.
	bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
	                                  unsigned AddrSpace) const override {
	  // Replacing three or more scalar stores still reduces the instruction
	  // count after the extra vector constant load is added.
	  const unsigned MinElemsForWin = 3;
	  return NumElem >= MinElemsForWin;
	}

	bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;

	/// Intel processors have a unified instruction and data cache, so no
	/// clear-cache builtin is named here: this always returns nullptr.
	const char *getClearCacheBuiltinName() const override {
	  // Nothing to do, move along.
	  return nullptr;
	}

	unsigned getRegisterByName(const char* RegName, EVT VT,
	SelectionDAG &DAG) const override;

	/// If a physical register, this returns the register that receives the
	/// exception address on entry to an EH pad.
	unsigned
	getExceptionPointerRegister(const Constant *PersonalityFn) const override;

	/// If a physical register, this returns the register that receives the
	/// exception typeid on entry to a landing pad.
	unsigned
	getExceptionSelectorRegister(const Constant *PersonalityFn) const override;

	/// Declaration only. `override` already requires the function to be
	/// virtual, so no separate `virtual` keyword is spelled out.
	bool needsFixedCatchObjects() const override;

	/// This method returns a target specific FastISel object,
	/// or null if the target does not support "fast" ISel.
	FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo) const override;

	/// If the target has a standard location for the stack protector cookie,
	/// returns the address of that location. Otherwise, returns nullptr.
	Value *getIRStackGuard(IRBuilder<> &IRB) const override;

	bool useLoadStackGuardNode() const override;
	bool useStackGuardXorFP() const override;
	void insertSSPDeclarations(Module &M) const override;
	Value *getSDagStackGuard(const Module &M) const override;
	Value *getSSPStackGuardCheck(const Module &M) const override;
	SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
	const SDLoc &DL) const override;


	/// Return true if the target stores SafeStack pointer at a fixed offset in
	/// some non-standard address space, and populates the address space and
	/// offset as appropriate.
	/// NOTE(review): the declaration below returns a Value * (not a bool) and
	/// takes only an IRBuilder, so the "return true ... populates" wording
	/// looks stale. Presumably the result is the location itself; confirm
	/// against the definition.
	Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;

	SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
	SelectionDAG &DAG) const;

	bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;

	/// \brief Customize the preferred legalization strategy for certain types.
	LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;

	bool isIntDivCheap(EVT VT, AttributeList Attr) const override;

	bool supportSwiftError() const override;

	StringRef getStackProbeSymbolName(MachineFunction &MF) const override;

	/// Reports a fixed maximum interleave factor of 4.
	unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

	/// \brief Lower interleaved load(s) into target specific
	/// instructions/intrinsics.
	bool lowerInterleavedLoad(LoadInst *LI,
	ArrayRef<ShuffleVectorInst *> Shuffles,
	ArrayRef<unsigned> Indices,
	unsigned Factor) const override;

	/// \brief Lower interleaved store(s) into target specific
	/// instructions/intrinsics.
	bool lowerInterleavedStore(StoreInst SI, ShuffleVectorInst SVI,
	unsigned Factor) const override;


	void finalizeLowering(MachineFunction &MF) const override;

	protected:
	std::pair<const TargetRegisterClass *, uint8_t>
	findRepresentativeClass(const TargetRegisterInfo *TRI,
	MVT VT) const override;

	private:
	/// Keep a reference to the X86Subtarget around so that we can
	/// make the right decision when generating code for different targets.
	const X86Subtarget &Subtarget;

	/// Select between SSE or x87 floating point ops.
	/// When SSE is available, use it for f32 operations.
	/// When SSE2 is available, use it for f64 operations.
	bool X86ScalarSSEf32;
	bool X86ScalarSSEf64;

	/// A list of legal FP immediates.
	std::vector<APFloat> LegalFPImmediates;

	/// Record \p Imm as an FP immediate that this x86 target can
	/// instruction-select natively, by appending a copy to LegalFPImmediates.
	void addLegalFPImmediate(const APFloat &Imm) {
	  LegalFPImmediates.emplace_back(Imm);
	}

	SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
	CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &InVals,
	uint32_t *RegMask) const;
	SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
	const SmallVectorImpl<ISD::InputArg> &ArgInfo,
	const SDLoc &dl, SelectionDAG &DAG,
	const CCValAssign &VA, MachineFrameInfo &MFI,
	unsigned i) const;
	SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
	const SDLoc &dl, SelectionDAG &DAG,
	const CCValAssign &VA,
	ISD::ArgFlagsTy Flags) const;

	// Call lowering helpers.

	/// Check whether the call is eligible for tail call optimization. Targets
	/// that want to do tail call optimization should implement this function.
	bool IsEligibleForTailCallOptimization(SDValue Callee,
	CallingConv::ID CalleeCC,
	bool isVarArg,
	bool isCalleeStructRet,
	bool isCallerStructRet,
	Type *RetTy,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	SelectionDAG& DAG) const;
	SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
	SDValue Chain, bool IsTailCall,
	bool Is64Bit, int FPDiff,
	const SDLoc &dl) const;

	unsigned GetAlignedArgumentStackSize(unsigned StackSize,
	SelectionDAG &DAG) const;

	/// Declaration only: takes no arguments and returns an unsigned value.
	unsigned getAddressSpace() const;

	std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
	bool isSigned,
	bool isReplace) const;

	SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;

	unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const;
	SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
	int64_t Offset, SelectionDAG &DAG) const;
	SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;

	SDValue
	LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &InVals) const override;
	SDValue LowerCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const override;

	SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &dl, SelectionDAG &DAG) const override;

	/// Returns true only when the function behind \p MF uses the CXX_FAST_TLS
	/// calling convention and also carries the NoUnwind attribute.
	bool supportSplitCSR(MachineFunction *MF) const override {
	  const auto &F = MF->getFunction();
	  if (F.getCallingConv() != CallingConv::CXX_FAST_TLS)
	    return false;
	  return F.hasFnAttribute(Attribute::NoUnwind);
	}
	void initializeSplitCSR(MachineBasicBlock *Entry) const override;
	void insertCopiesSplitCSR(
	MachineBasicBlock *Entry,
	const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;

	bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;

	bool mayBeEmittedAsTailCall(const CallInst *CI) const override;

	EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
	ISD::NodeType ExtendKind) const override;

	bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
	bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	LLVMContext &Context) const override;

	const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;

	TargetLoweringBase::AtomicExpansionKind
	shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
	bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
	TargetLoweringBase::AtomicExpansionKind
	shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;

	LoadInst *
	lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

	bool needsCmpXchgNb(Type *MemType) const;

	void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
	MachineBasicBlock *DispatchBB, int FI) const;

	// Utility function to emit the low-level va_arg code for X86-64.
	MachineBasicBlock *
	EmitVAARG64WithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	/// Utility function to emit the xmm reg save portion of va_start.
	MachineBasicBlock *
	EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
	MachineBasicBlock *BB) const;

	MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
	MachineInstr &MI2,
	MachineBasicBlock *BB) const;

	MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
	MachineBasicBlock *BB) const;

	MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
	MachineBasicBlock *BB) const;

	MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
	MachineBasicBlock *BB) const;

	MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
	MachineBasicBlock *BB) const;

	MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
	MachineBasicBlock *BB) const;

	MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
	MachineBasicBlock *BB) const;

	MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
	MachineBasicBlock *BB) const;
	+
	+ MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
	+ MachineBasicBlock *BB) const;

	MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	/// Emit nodes that will be selected as "test Op0,Op0", or something
	/// equivalent, for use with the given x86 condition code.
	SDValue EmitTest(SDValue Op0, unsigned X86CC, const SDLoc &dl,
	SelectionDAG &DAG) const;

	/// Emit nodes that will be selected as "cmp Op0,Op1", or something
	/// equivalent, for use with the given x86 condition code.
	SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl,
	SelectionDAG &DAG) const;

	/// Convert a comparison if required by the subtarget.
	SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;

	/// Check if replacement of SQRT with RSQRT should be disabled.
	bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;

	/// Use rsqrt* to speed up sqrt calculations.
	SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
	int &RefinementSteps, bool &UseOneConstNR,
	bool Reciprocal) const override;

	/// Use rcp* to speed up fdiv calculations.
	SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
	int &RefinementSteps) const override;

	/// Reassociate floating point divisions into multiply by reciprocal.
	unsigned combineRepeatedFPDivisors() const override;
	};

	namespace X86 {
	FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo);
	} // end namespace X86

	// Base class for all X86 non-masked store operations.
	class X86StoreSDNode : public MemSDNode {
	public:
	  X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
	                 SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
	      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}

	  // Operand 1 is the stored value, operand 2 the base pointer.
	  const SDValue &getValue() const { return getOperand(1); }
	  const SDValue &getBasePtr() const { return getOperand(2); }

	  // Accepts nodes whose opcode is VTRUNCSTORES or VTRUNCSTOREUS.
	  static bool classof(const SDNode *N) {
	    return N->getOpcode() == X86ISD::VTRUNCSTORES ||
	           N->getOpcode() == X86ISD::VTRUNCSTOREUS;
	  }
	};

	// Base class for all X86 masked store operations.
	// The class has the same order of operands as MaskedStoreSDNode for
	// convenience.
	class X86MaskedStoreSDNode : public MemSDNode {
	public:
	  X86MaskedStoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
	                       SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
	      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}

	  // Operand 1 is the base pointer, operand 2 the mask, operand 3 the value.
	  const SDValue &getBasePtr() const { return getOperand(1); }
	  const SDValue &getMask() const { return getOperand(2); }
	  const SDValue &getValue() const { return getOperand(3); }

	  // Accepts nodes whose opcode is VMTRUNCSTORES or VMTRUNCSTOREUS.
	  static bool classof(const SDNode *N) {
	    return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
	           N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
	  }
	};

	// X86 truncating store with signed saturation: an X86StoreSDNode whose
	// opcode is fixed to X86ISD::VTRUNCSTORES.
	class TruncSStoreSDNode : public X86StoreSDNode {
	public:
	  TruncSStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
	                    EVT MemVT, MachineMemOperand *MMO)
	      : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}

	  // Accepts only nodes carrying the VTRUNCSTORES opcode.
	  static bool classof(const SDNode *N) {
	    const unsigned Opc = N->getOpcode();
	    return Opc == X86ISD::VTRUNCSTORES;
	  }
	};

	// X86 truncating store with unsigned saturation: an X86StoreSDNode whose
	// opcode is fixed to X86ISD::VTRUNCSTOREUS.
	class TruncUSStoreSDNode : public X86StoreSDNode {
	public:
	  TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
	                     EVT MemVT, MachineMemOperand *MMO)
	      : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}

	  // Accepts only nodes carrying the VTRUNCSTOREUS opcode.
	  static bool classof(const SDNode *N) {
	    const unsigned Opc = N->getOpcode();
	    return Opc == X86ISD::VTRUNCSTOREUS;
	  }
	};

	// X86 truncating masked store with signed saturation: an
	// X86MaskedStoreSDNode whose opcode is fixed to X86ISD::VMTRUNCSTORES.
	class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
	public:
	  MaskedTruncSStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
	                          EVT MemVT, MachineMemOperand *MMO)
	      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT,
	                             MMO) {}

	  // Accepts only nodes carrying the VMTRUNCSTORES opcode.
	  static bool classof(const SDNode *N) {
	    const unsigned Opc = N->getOpcode();
	    return Opc == X86ISD::VMTRUNCSTORES;
	  }
	};

	// X86 truncating masked store with unsigned saturation: an
	// X86MaskedStoreSDNode whose opcode is fixed to X86ISD::VMTRUNCSTOREUS.
	class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
	public:
	  MaskedTruncUSStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
	                           EVT MemVT, MachineMemOperand *MMO)
	      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT,
	                             MMO) {}

	  // Accepts only nodes carrying the VMTRUNCSTOREUS opcode.
	  static bool classof(const SDNode *N) {
	    const unsigned Opc = N->getOpcode();
	    return Opc == X86ISD::VMTRUNCSTOREUS;
	  }
	};

	// X86 specific Gather/Scatter nodes.
	// The class has the same order of operands as MaskedGatherScatterSDNode for
	// convenience.
	class X86MaskedGatherScatterSDNode : public MemSDNode {
	public:
	  X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
	                               const DebugLoc &dl, SDVTList VTs, EVT MemVT,
	                               MachineMemOperand *MMO)
	      : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}

	  // Operand layout used by the accessors: 1 = value, 2 = mask,
	  // 3 = base pointer, 4 = index.
	  const SDValue &getBasePtr() const { return getOperand(3); }
	  const SDValue &getIndex() const { return getOperand(4); }
	  const SDValue &getMask() const { return getOperand(2); }
	  const SDValue &getValue() const { return getOperand(1); }

	  // Accepts nodes whose opcode is MGATHER or MSCATTER.
	  static bool classof(const SDNode *N) {
	    return N->getOpcode() == X86ISD::MGATHER ||
	           N->getOpcode() == X86ISD::MSCATTER;
	  }
	};

	// Gather flavour of X86MaskedGatherScatterSDNode: the opcode is fixed to
	// X86ISD::MGATHER.
	class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
	public:
	  X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
	                        EVT MemVT, MachineMemOperand *MMO)
	      : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
	                                     MMO) {}

	  // Accepts only nodes carrying the MGATHER opcode.
	  static bool classof(const SDNode *N) {
	    const unsigned Opc = N->getOpcode();
	    return Opc == X86ISD::MGATHER;
	  }
	};

	// Scatter flavour of X86MaskedGatherScatterSDNode: the opcode is fixed to
	// X86ISD::MSCATTER.
	class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
	public:
	  X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
	                         EVT MemVT, MachineMemOperand *MMO)
	      : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
	                                     MMO) {}

	  // Accepts only nodes carrying the MSCATTER opcode.
	  static bool classof(const SDNode *N) {
	    const unsigned Opc = N->getOpcode();
	    return Opc == X86ISD::MSCATTER;
	  }
	};

	/// Generate unpacklo/unpackhi shuffle mask.
	///
	/// Appends one index per element of \p VT to \p Mask, which must be empty
	/// on entry. Indices are computed per 128-bit lane. \p Lo selects the low
	/// half of each lane (otherwise the high half). Unless \p Unary is set,
	/// odd result elements are offset by the element count of \p VT.
	template <typename T = int>
	void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
	                             bool Unary) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  int NumElts = VT.getVectorNumElements();
	  int NumEltsInLane = 128 / VT.getScalarSizeInBits();
	  for (int i = 0; i < NumElts; ++i) {
	    const int IdxInLane = i % NumEltsInLane;
	    // First element index of the lane that holds element i.
	    const unsigned LaneBase = i - IdxInLane;
	    int Pos = IdxInLane / 2 + LaneBase;
	    if (!Unary)
	      Pos += NumElts * (i % 2);
	    if (!Lo)
	      Pos += NumEltsInLane / 2;
	    Mask.push_back(Pos);
	  }
	}

	/// Helper function to scale a shuffle or target shuffle mask, replacing each
	/// mask index with the scaled sequential indices for an equivalent narrowed
	/// mask. This is the reverse process to canWidenShuffleElements, but can
	/// always succeed.
	///
	/// \p ScaledMask is overwritten and ends up with Mask.size() * Scale
	/// entries. \p Scale must be positive.
	template <typename T>
	void scaleShuffleMask(int Scale, ArrayRef<T> Mask,
	                      SmallVectorImpl<T> &ScaledMask) {
	  assert(0 < Scale && "Unexpected scaling factor");
	  int NumElts = Mask.size();
	  ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);

	  for (int i = 0; i != NumElts; ++i) {
	    const int M = Mask[i];
	    const int Base = Scale * i;
	    for (int s = 0; s != Scale; ++s) {
	      // Negative (sentinel) entries are repeated verbatim; real indices are
	      // scaled and then counted up across the widened slot.
	      ScaledMask[Base + s] = (M < 0) ? M : (Scale * M) + s;
	    }
	  }
	}
	} // end namespace llvm

	#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
	Index: head/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86InstrCompiler.td (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86InstrCompiler.td (revision 328817)
	@@ -1,2001 +1,2009 @@
	//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -- tablegen --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the various pseudo instructions used by the compiler,
	// as well as Pat patterns used during instruction selection.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Pattern Matching Support

	// Immediate transform: rebuilds the matched immediate as an i32 immediate
	// from its zero-extended value cast to unsigned, at the node's own SDLoc.
	def GetLo32XForm : SDNodeXForm<imm, [{
	// Transformation function: get the low 32 bits.
	return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N));
	}]>;

	// Immediate transform: rebuilds the matched immediate as an i8 immediate
	// from its zero-extended value cast to uint8_t, at the node's own SDLoc.
	def GetLo8XForm : SDNodeXForm<imm, [{
	// Transformation function: get the low 8 bits.
	return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
	}]>;


	//===----------------------------------------------------------------------===//
	// Random Pseudo Instructions.

	// PIC base construction. This expands to code that looks like this:
	//   call $next_inst
	//   popl %destreg
	// The pseudo has an empty asm string, defines a GR32 result ($reg) and
	// takes a 32-bit immediate label operand.
	let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
	SchedRW = [WriteJump] in
	def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
	"", [], IIC_CALL_RI>;


	// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
	// a stack adjustment and the codegen must know that they may modify the stack
	// pointer before prolog-epilog rewriting occurs.
	// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
	// sub / add which can clobber EFLAGS.
	let Defs = [ESP, EFLAGS, SSP], Uses = [ESP, SSP], SchedRW = [WriteALU] in {
	def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
	(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
	"#ADJCALLSTACKDOWN", [], IIC_ALU_NONMEM>,
	Requires<[NotLP64]>;
	def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
	"#ADJCALLSTACKUP",
	[(X86callseq_end timm:$amt1, timm:$amt2)],
	IIC_ALU_NONMEM>, Requires<[NotLP64]>;
	}
	def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
	(ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;


	// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
	// a stack adjustment and the codegen must know that they may modify the stack
	// pointer before prolog-epilog rewriting occurs.
	// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
	// sub / add which can clobber EFLAGS.
	let Defs = [RSP, EFLAGS, SSP], Uses = [RSP, SSP], SchedRW = [WriteALU] in {
	def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
	(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
	"#ADJCALLSTACKDOWN",
	[], IIC_ALU_NONMEM>, Requires<[IsLP64]>;
	def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
	"#ADJCALLSTACKUP",
	[(X86callseq_end timm:$amt1, timm:$amt2)],
	IIC_ALU_NONMEM>, Requires<[IsLP64]>;
	}
	def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
	(ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;

	// Everything down to the matching "} // SchedRW" gets SchedRW = [WriteSystem].
	let SchedRW = [WriteSystem] in {

	// x86-64 va_start lowering magic.
	// The records inside this inner let (VASTART_SAVE_XMM_REGS, VAARG_64 and the
	// two SEG_ALLOCA pseudos) all use a custom inserter.
	let usesCustomInserter = 1, Defs = [EFLAGS] in {
	def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
	(outs),
	(ins GR8:$al,
	i64imm:$regsavefi, i64imm:$offset,
	variable_ops),
	"#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
	[(X86vastart_save_xmm_regs GR8:$al,
	imm:$regsavefi,
	imm:$offset),
	(implicit EFLAGS)]>;

	// The VAARG_64 pseudo-instruction takes the address of the va_list,
	// and places the address of the next argument into a register.
	// NOTE(review): Defs = [EFLAGS] is already set by the enclosing let, so the
	// let on the next line looks redundant.
	let Defs = [EFLAGS] in
	def VAARG_64 : I<0, Pseudo,
	(outs GR64:$dst),
	(ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
	"#VAARG_64 $dst, $ap, $size, $mode, $align",
	[(set GR64:$dst,
	(X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
	(implicit EFLAGS)]>;


	// When using segmented stacks these are lowered into instructions which first
	// check if the current stacklet has enough free memory. If it does, memory is
	// allocated by bumping the stack pointer. Otherwise memory is allocated from
	// the heap.
	// Each definition below supplies its own Defs/Uses lists naming the
	// accumulator, the stack pointer and EFLAGS for its width.

	let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
	def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
	"# variable sized alloca for segmented stacks",
	[(set GR32:$dst,
	(X86SegAlloca GR32:$size))]>,
	Requires<[NotLP64]>;

	let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
	def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
	"# variable sized alloca for segmented stacks",
	[(set GR64:$dst,
	(X86SegAlloca GR64:$size))]>,
	Requires<[In64BitMode]>;
	}

	// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
	// targets. These calls are needed to probe the stack when allocating more than
	// 4k bytes in one go. Touching the stack at 4K increments is necessary to
	// ensure that the guard pages used by the OS virtual memory manager are
	// allocated in correct sequence.
	// The main point of having separate instructions is to capture extra
	// unmodelled effects (compared to ordinary calls), such as the stack pointer
	// change.
	// Unlike the SEG_ALLOCA pseudos above, these have no output operand and sit
	// outside the usesCustomInserter let.

	let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
	def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
	"# dynamic stack allocation",
	[(X86WinAlloca GR32:$size)]>,
	Requires<[NotLP64]>;

	let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
	def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
	"# dynamic stack allocation",
	[(X86WinAlloca GR64:$size)]>,
	Requires<[In64BitMode]>;
	} // SchedRW

	// These instructions XOR the frame pointer into a GPR. They are used in some
	// stack protection schemes. These are post-RA pseudos because we only know the
	// frame register after register allocation.
	//
	// $dst is tied to $src and EFLAGS is listed as clobbered. Neither record has
	// a selection pattern. The asm strings refer to the frame register with the
	// placeholder text "$FP" (written "$$FP" so the dollar sign is literal), and
	// both widths print their operands as a comma-separated pair.
	let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in {
	def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
	"xorl\t$$FP, $src", [], IIC_BIN_NONMEM>,
	Requires<[NotLP64]>, Sched<[WriteALU]>;
	def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src),
	"xorq\t$$FP, $src", [], IIC_BIN_NONMEM>,
	Requires<[In64BitMode]>, Sched<[WriteALU]>;
	}

	//===----------------------------------------------------------------------===//
	// EH Pseudo Instructions
	//
	let SchedRW = [WriteSystem] in {

	// Return forms used for eh_return. The 32- and 64-bit records share the same
	// flags and the same 0xC3 RawFrm encoding; only the register class of $addr
	// differs.
	// NOTE(review): each def names Sched<[WriteJumpLd]> while the enclosing let
	// sets SchedRW = [WriteSystem]; confirm which value ends up in the record.
	let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1,
	    isCodeGenOnly = 1 in {
	  def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
	                    "ret\t#eh_return, addr: $addr",
	                    [(X86ehret GR32:$addr)], IIC_RET>,
	                  Sched<[WriteJumpLd]>;

	  def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
	                      "ret\t#eh_return, addr: $addr",
	                      [(X86ehret GR64:$addr)], IIC_RET>,
	                    Sched<[WriteJumpLd]>;
	}

	// Funclet return pseudos; both are terminators, barriers and returns.
	let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
	    isCodeGenOnly = 1, isReturn = 1 in {
	  def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET",
	                     [(cleanupret)]>;

	  // CATCHRET needs a custom inserter for SEH.
	  let usesCustomInserter = 1 in
	  def CATCHRET : I<0, Pseudo, (outs),
	                   (ins brtarget32:$dst, brtarget32:$from),
	                   "# CATCHRET",
	                   [(catchret bb:$dst, bb:$from)]>;
	}

	let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
	    usesCustomInserter = 1 in
	def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;

	// This instruction is responsible for re-establishing stack pointers after an
	// exception has been caught and we are rejoining normal control flow in the
	// parent function or funclet. It generally sets ESP and EBP, and optionally
	// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
	// elsewhere.
	let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in
	def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>;

	// setjmp/longjmp style pseudos. The setjmp forms produce a 32-bit result in
	// both modes; the longjmp forms produce nothing and are also terminators.
	let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
	    usesCustomInserter = 1 in {
	  def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
	                           "#EH_SJLJ_SETJMP32",
	                           [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
	                         Requires<[Not64BitMode]>;
	  def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
	                           "#EH_SJLJ_SETJMP64",
	                           [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
	                         Requires<[In64BitMode]>;

	  let isTerminator = 1 in {
	    def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
	                              "#EH_SJLJ_LONGJMP32",
	                              [(X86eh_sjlj_longjmp addr:$buf)]>,
	                            Requires<[Not64BitMode]>;
	    def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
	                              "#EH_SJLJ_LONGJMP64",
	                              [(X86eh_sjlj_longjmp addr:$buf)]>,
	                            Requires<[In64BitMode]>;
	  }
	}

	// Pattern-less branch pseudo taking a single branch target.
	let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in
	def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
	                      "#EH_SjLj_Setup\t$dst", []>;
	} // SchedRW

	//===----------------------------------------------------------------------===//
	// Pseudo instructions used by unwind info.
	//
	// Each record below is a pattern-less pseudo with no outputs; its operands
	// are plain immediates and its asm string starts with '#'.
	let isPseudo = 1, SchedRW = [WriteSystem] in {
	def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
	"#SEH_PushReg $reg", []>;
	def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
	"#SEH_SaveReg $reg, $dst", []>;
	def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
	"#SEH_SaveXMM $reg, $dst", []>;
	def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
	"#SEH_StackAlloc $size", []>;
	def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
	"#SEH_SetFrame $reg, $offset", []>;
	// $mode is a 1-bit immediate.
	def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
	"#SEH_PushFrame $mode", []>;
	// The last two markers take no operands at all.
	def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
	"#SEH_EndPrologue", []>;
	def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
	"#SEH_Epilogue", []>;
	}

	//===----------------------------------------------------------------------===//
	// Pseudo instructions used by segmented stacks.
	//

	// This is lowered into a RET instruction by MCInstLower. We need
	// this so that we don't have to have a MachineBasicBlock which ends
	// with a RET and also has successors.
	// Both records in this let are operand-less pseudos with an empty asm string
	// and no selection pattern.
	let isPseudo = 1, SchedRW = [WriteJumpLd] in {
	def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
	"", [], IIC_RET>;

	// This instruction is lowered to a RET followed by a MOV. The two
	// instructions are not generated on a higher level since then the
	// verifier sees a MachineBasicBlock ending with a non-terminator.
	def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
	"", [], IIC_RET>;
	}

	//===----------------------------------------------------------------------===//
	// Alias Instructions
	//===----------------------------------------------------------------------===//

	// Alias instruction mapping movr0 to xor.
	// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
	// The pseudo matches the 32-bit constant 0, lists EFLAGS as clobbered, and is
	// marked rematerializable and as cheap as a move.
	let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
	isPseudo = 1, AddedComplexity = 10 in
	def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
	[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;

	// Other widths can also make use of the 32-bit xor, which may have a smaller
	// encoding and avoid partial register updates.
	// i8 and i16 zero take the matching sub-register of MOV32r0; i64 zero wraps
	// MOV32r0 in SUBREG_TO_REG.
	let AddedComplexity = 10 in {
	def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
	def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
	def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
	}

	// Size-oriented materialization of 1 and -1. Everything in this scope is
	// guarded by OptForSize and Not64BitMode.
	let Predicates = [OptForSize, Not64BitMode],
	AddedComplexity = 10 in {
	let SchedRW = [WriteALU] in {
	// Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
	// which only require 3 bytes compared to MOV32ri which requires 5.
	let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
	def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
	[(set GR32:$dst, 1)], IIC_ALU_NONMEM>;
	def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
	[(set GR32:$dst, -1)], IIC_ALU_NONMEM>;
	}
	} // SchedRW

	// MOV16ri is 4 bytes, so the instructions above are smaller.
	// The 16-bit constants reuse the 32-bit pseudos through sub_16bit.
	def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
	def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
	}

	// Pseudos matching 32- and 64-bit constants that fit a sign-extended 8-bit
	// immediate (i32immSExt8 / i64immSExt8). Both require OptForMinSize and
	// NotWin64WithoutFP and have an empty asm string.
	let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5,
	SchedRW = [WriteALU] in {
	// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
	def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
	[(set GR32:$dst, i32immSExt8:$src)], IIC_ALU_NONMEM>,
	Requires<[OptForMinSize, NotWin64WithoutFP]>;
	def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
	[(set GR64:$dst, i64immSExt8:$src)], IIC_ALU_NONMEM>,
	Requires<[OptForMinSize, NotWin64WithoutFP]>;
	}

	// Materialize i64 constant where top 32-bits are zero. This could theoretically
	// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
	// that would make it more difficult to rematerialize.
	// The pseudo writes a GR32 but takes an i64i32imm operand; it has no pattern
	// of its own and is reached through the Pat at the end of this group.
	let isReMaterializable = 1, isAsCheapAsAMove = 1,
	isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteALU] in
	def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", [],
	IIC_ALU_NONMEM>;

	// This 64-bit pseudo-move can be used for both a 64-bit constant that is
	// actually the zero-extension of a 32-bit constant and for labels in the
	// x86-64 small code model.
	// Matching is delegated to the C++ selector named "selectMOV64Imm32", rooted
	// at imm or X86Wrapper nodes.
	def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;

	// An i64 value accepted by mov64imm32 becomes MOV32ri64 wrapped in
	// SUBREG_TO_REG over sub_32bit.
	let AddedComplexity = 1 in
	def : Pat<(i64 mov64imm32:$src),
	(SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;

	// Use sbb to materialize carry bit.
	// One operand-less pseudo per register width; each reads and writes EFLAGS
	// and matches X86setcc_c with X86_COND_B.
	let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
	// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
	// However, Pat<> can't replicate the destination reg into the inputs of the
	// result.
	def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
	[(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
	def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
	[(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
	def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
	[(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
	def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
	[(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
	} // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]


	// An any-extended 8-bit X86setcc_c value is selected directly as the SETB_C
	// pseudo of the destination width.
	def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
	(SETB_C16r)>;
	def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
	(SETB_C32r)>;
	def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
	(SETB_C64r)>;

	// Sign extension of the 8-bit value selects the same wider pseudos.
	def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
	(SETB_C16r)>;
	def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
	(SETB_C32r)>;
	def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
	(SETB_C64r)>;

	// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and
	// will be eliminated and that the sbb can be extended up to a wider type. When
	// this happens, it is great. However, if we are left with an 8-bit sbb and an
	// and, we might as well just match it as a setb.
	def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
	(SETBr)>;

	// (add OP, SETB) -> (adc OP, 0)
	// Covers the 8-, 32- and 64-bit widths; there is no 16-bit form here.
	def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
	(ADC8ri GR8:$op, 0)>;
	def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
	(ADC32ri8 GR32:$op, 0)>;
	def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
	(ADC64ri8 GR64:$op, 0)>;

	// (sub OP, SETB) -> (sbb OP, 0)
	def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
	(SBB8ri GR8:$op, 0)>;
	def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
	(SBB32ri8 GR32:$op, 0)>;
	def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
	(SBB64ri8 GR64:$op, 0)>;

	// (sub OP, SETCC_CARRY) -> (adc OP, 0)
	// Here the X86setcc_c value is used without the "and ..., 1" mask.
	// NOTE(review): presumably the unmasked value is all-ones when carry is set,
	// which is why subtracting it maps to adc - confirm against the node's
	// definition.
	def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
	(ADC8ri GR8:$op, 0)>;
	def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
	(ADC32ri8 GR32:$op, 0)>;
	def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
	(ADC64ri8 GR64:$op, 0)>;

	//===----------------------------------------------------------------------===//
	// String Pseudo Instructions
	//
	let SchedRW = [WriteMicrocoded] in {
	// REP MOVS forms for Not64BitMode. They have no explicit operands; ECX, EDI
	// and ESI are listed as both implicit uses and implicit defs.
	let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
	def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb\|rep movsb}",
	[(X86rep_movs i8)], IIC_REP_MOVS>, REP,
	Requires<[Not64BitMode]>;
	def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw\|rep movsw}",
	[(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
	Requires<[Not64BitMode]>;
	def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl\|rep movsd}",
	[(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
	Requires<[Not64BitMode]>;
	}

	// REP MOVS forms for In64BitMode, using RCX, RDI and RSI; adds the
	// quadword variant.
	let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
	def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb\|rep movsb}",
	[(X86rep_movs i8)], IIC_REP_MOVS>, REP,
	Requires<[In64BitMode]>;
	def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw\|rep movsw}",
	[(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
	Requires<[In64BitMode]>;
	def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl\|rep movsd}",
	[(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
	Requires<[In64BitMode]>;
	def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq\|rep movsq}",
	[(X86rep_movs i64)], IIC_REP_MOVS>, REP,
	Requires<[In64BitMode]>;
	}

	// FIXME: Should use "(X86rep_stos AL)" as the pattern.
	// REP STOS forms for Not64BitMode. Each def supplies its own Uses list so
	// that the accumulator sub-register matches the element width.
	let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
	let Uses = [AL,ECX,EDI] in
	def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb\|rep stosb}",
	[(X86rep_stos i8)], IIC_REP_STOS>, REP,
	Requires<[Not64BitMode]>;
	let Uses = [AX,ECX,EDI] in
	def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw\|rep stosw}",
	[(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
	Requires<[Not64BitMode]>;
	let Uses = [EAX,ECX,EDI] in
	def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl\|rep stosd}",
	[(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
	Requires<[Not64BitMode]>;
	}

	// REP STOS forms for In64BitMode.
	let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
	let Uses = [AL,RCX,RDI] in
	def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb\|rep stosb}",
	[(X86rep_stos i8)], IIC_REP_STOS>, REP,
	Requires<[In64BitMode]>;
	let Uses = [AX,RCX,RDI] in
	def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw\|rep stosw}",
	[(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
	Requires<[In64BitMode]>;
	// NOTE(review): the 32-bit element form lists RAX here, whereas
	// REP_STOSD_32 lists EAX - confirm the wider register is intentional.
	let Uses = [RAX,RCX,RDI] in
	def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl\|rep stosd}",
	[(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
	Requires<[In64BitMode]>;

	let Uses = [RAX,RCX,RDI] in
	def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq\|rep stosq}",
	[(X86rep_stos i64)], IIC_REP_STOS>, REP,
	Requires<[In64BitMode]>;
	}
	} // SchedRW

	//===----------------------------------------------------------------------===//
	// Thread Local Storage Instructions
	//
	let SchedRW = [WriteSystem] in {

	// ELF TLS Support
	// All calls clobber the non-callee saved registers. ESP is marked as
	// a use to prevent stack-pointer assignments that appear immediately
	// before calls from potentially appearing dead.
	// Both records use a custom inserter and require Not64BitMode.
	let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
	ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
	MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
	usesCustomInserter = 1, Uses = [ESP, SSP] in {
	def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
	"# TLS_addr32",
	[(X86tlsaddr tls32addr:$sym)]>,
	Requires<[Not64BitMode]>;
	def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
	"# TLS_base_addr32",
	[(X86tlsbaseaddr tls32baseaddr:$sym)]>,
	Requires<[Not64BitMode]>;
	}

	// All calls clobber the non-callee saved registers. RSP is marked as
	// a use to prevent stack-pointer assignments that appear immediately
	// before calls from potentially appearing dead.
	// 64-bit counterparts; the clobber list additionally names RSI, RDI and
	// R8-R11.
	let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
	FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
	ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
	MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
	usesCustomInserter = 1, Uses = [RSP, SSP] in {
	def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
	"# TLS_addr64",
	[(X86tlsaddr tls64addr:$sym)]>,
	Requires<[In64BitMode]>;
	def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
	"# TLS_base_addr64",
	[(X86tlsbaseaddr tls64baseaddr:$sym)]>,
	Requires<[In64BitMode]>;
	}

	// Darwin TLS Support
	// For i386, the address of the thunk is passed on the stack, on return the
	// address of the variable is in %eax. %ecx is trashed during the function
	// call. All other registers are preserved.
	let Defs = [EAX, ECX, EFLAGS],
	Uses = [ESP, SSP],
	usesCustomInserter = 1 in
	def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
	"# TLSCall_32",
	[(X86TLSCall addr:$sym)]>,
	Requires<[Not64BitMode]>;

	// For x86_64, the address of the thunk is passed in %rdi, but the
	// pseudo directly uses the symbol, so do not add an implicit use of
	// %rdi. The lowering will do the right thing with RDI.
	// On return the address of the variable is in %rax. All other
	// registers are preserved.
	let Defs = [RAX, EFLAGS],
	Uses = [RSP, SSP],
	usesCustomInserter = 1 in
	def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
	"# TLSCall_64",
	[(X86TLSCall addr:$sym)]>,
	Requires<[In64BitMode]>;
	} // SchedRW

	//===----------------------------------------------------------------------===//
	// Conditional Move Pseudo Instructions

	// CMOV* - Used to implement the SELECT DAG operation. Expanded after
	// instruction selection into a branch sequence.
	// RC is the register class of the result and of both value operands; VT is
	// the value type given to the X86cmov node in the pattern. The record is named
	// "CMOV" followed by the defm name, so "defm _GR8" yields CMOV_GR8.
	multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
	def CMOV#NAME : I<0, Pseudo,
	(outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
	"#CMOV_"#NAME#" PSEUDO!",
	[(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond,
	EFLAGS)))]>;
	}

	// All instantiations use a custom inserter, carry no scheduling info and
	// implicitly read EFLAGS.
	let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
	// X86 doesn't have 8-bit conditional moves. Use a customInserter to
	// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
	// however that requires promoting the operands, and can induce additional
	// i8 register pressure.
	defm _GR8 : CMOVrr_PSEUDO<GR8, i8>;

	// 16- and 32-bit integer forms are only available under NoCMov.
	let Predicates = [NoCMov] in {
	defm _GR32 : CMOVrr_PSEUDO<GR32, i32>;
	defm _GR16 : CMOVrr_PSEUDO<GR16, i16>;
	} // Predicates = [NoCMov]

	// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
	// SSE1/SSE2.
	let Predicates = [FPStackf32] in
	defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>;

	let Predicates = [FPStackf64] in
	defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>;

	defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;

	// Scalar FP, vector and mask-register forms; none of these set a predicate
	// here. Several vector types share one register class (e.g. VR128).
	defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
	defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
	defm _FR128 : CMOVrr_PSEUDO<FR128, f128>;
	defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>;
	defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>;
	defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>;
	defm _V8F32 : CMOVrr_PSEUDO<VR256, v8f32>;
	defm _V4F64 : CMOVrr_PSEUDO<VR256, v4f64>;
	defm _V4I64 : CMOVrr_PSEUDO<VR256, v4i64>;
	defm _V8I64 : CMOVrr_PSEUDO<VR512, v8i64>;
	defm _V8F64 : CMOVrr_PSEUDO<VR512, v8f64>;
	defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>;
	defm _V8I1 : CMOVrr_PSEUDO<VK8, v8i1>;
	defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>;
	defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>;
	defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>;
	} // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS]

	//===----------------------------------------------------------------------===//
	// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
	//===----------------------------------------------------------------------===//

	// FIXME: Use normal instructions and add lock prefix dynamically.

	// Memory barriers

	// TODO: Get this to fold the constant into the instruction.
	// LOCK-prefixed 32-bit "or" of a register into memory, restricted to
	// Not64BitMode. It has no selection pattern; the register operand is named
	// $zero.
	let isCodeGenOnly = 1, Defs = [EFLAGS] in
	def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
	"or{l}\t{$zero, $dst\|$dst, $zero}", [],
	IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK,
	Sched<[WriteALULd, WriteRMW]>;

	// Operand-less pseudo matching X86MemBarrier; marked as having side effects.
	let hasSideEffects = 1 in
	def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
	"#MEMBARRIER",
	[(X86MemBarrier)]>, Sched<[WriteLoad]>;

	// LOCK-prefixed read-modify-write binary arithmetic on memory.
	//
	//   RegOpc  - opcode of the "mr" (memory, register) form
	//   ImmOpc  - opcode of the "mi" (memory, full-width immediate) form
	//   ImmOpc8 - opcode of the "mi8" (memory, sign-extended 8-bit immediate) form
	//   ImmMod  - instruction format shared by the mi and mi8 forms
	//   Op      - SDNode matched by every pattern; its result is bound to EFLAGS
	//   mnemonic - base mnemonic; a width suffix is appended per record
	//
	// For every opcode the low bit is overridden: 0 for the 8-bit records and 1
	// for all wider records; bits 7..1 are taken from the template argument.
	multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
	                           Format ImmMod, SDNode Op, string mnemonic> {
	  let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
	      SchedRW = [WriteALULd, WriteRMW] in {

	    // --- memory, register -------------------------------------------------
	    def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
	                      RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
	                     MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
	                     mnemonic # "{b}\t{$src2, $dst\|$dst, $src2}",
	                     [(set EFLAGS, (Op addr:$dst, GR8:$src2))],
	                     IIC_ALU_NONMEM>, LOCK;

	    def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
	                       RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
	                      MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
	                      mnemonic # "{w}\t{$src2, $dst\|$dst, $src2}",
	                      [(set EFLAGS, (Op addr:$dst, GR16:$src2))],
	                      IIC_ALU_NONMEM>, OpSize16, LOCK;

	    def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
	                       RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
	                      MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
	                      mnemonic # "{l}\t{$src2, $dst\|$dst, $src2}",
	                      [(set EFLAGS, (Op addr:$dst, GR32:$src2))],
	                      IIC_ALU_NONMEM>, OpSize32, LOCK;

	    def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
	                        RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
	                       MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
	                       mnemonic # "{q}\t{$src2, $dst\|$dst, $src2}",
	                       [(set EFLAGS, (Op addr:$dst, GR64:$src2))],
	                       IIC_ALU_NONMEM>, LOCK;

	    // --- memory, full-width immediate -------------------------------------
	    def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
	                        ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
	                       ImmMod, (outs), (ins i8mem:$dst, i8imm:$src2),
	                       mnemonic # "{b}\t{$src2, $dst\|$dst, $src2}",
	                       [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))],
	                       IIC_ALU_MEM>, LOCK;

	    def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
	                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
	                         ImmMod, (outs), (ins i16mem:$dst, i16imm:$src2),
	                         mnemonic # "{w}\t{$src2, $dst\|$dst, $src2}",
	                         [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))],
	                         IIC_ALU_MEM>, OpSize16, LOCK;

	    def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
	                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
	                         ImmMod, (outs), (ins i32mem:$dst, i32imm:$src2),
	                         mnemonic # "{l}\t{$src2, $dst\|$dst, $src2}",
	                         [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))],
	                         IIC_ALU_MEM>, OpSize32, LOCK;

	    def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
	                              ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
	                             ImmMod, (outs),
	                             (ins i64mem:$dst, i64i32imm:$src2),
	                             mnemonic # "{q}\t{$src2, $dst\|$dst, $src2}",
	                             [(set EFLAGS,
	                                   (Op addr:$dst, i64immSExt32:$src2))],
	                             IIC_ALU_MEM>, LOCK;

	    // --- memory, sign-extended 8-bit immediate ----------------------------
	    def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
	                          ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
	                         ImmMod, (outs), (ins i16mem:$dst, i16i8imm:$src2),
	                         mnemonic # "{w}\t{$src2, $dst\|$dst, $src2}",
	                         [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))],
	                         IIC_ALU_MEM>, OpSize16, LOCK;

	    def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
	                          ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
	                         ImmMod, (outs), (ins i32mem:$dst, i32i8imm:$src2),
	                         mnemonic # "{l}\t{$src2, $dst\|$dst, $src2}",
	                         [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))],
	                         IIC_ALU_MEM>, OpSize32, LOCK;

	    def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
	                           ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
	                          ImmMod, (outs), (ins i64mem:$dst, i64i8imm:$src2),
	                          mnemonic # "{q}\t{$src2, $dst\|$dst, $src2}",
	                          [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))],
	                          IIC_ALU_MEM>, LOCK;

	  }

	}

	// Instantiations of LOCK_ArithBinOp. Arguments, in order: register-form
	// opcode, immediate opcode, 8-bit-immediate opcode, immediate format, matched
	// node, mnemonic. All five share the 0x80 / 0x83 immediate opcodes and differ
	// in the MRM<n>m format.
	defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, X86lock_add, "add">;
	defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, X86lock_sub, "sub">;
	defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, X86lock_or , "or">;
	defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">;
	defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;

	// LOCK-prefixed unary read-modify-write on memory, one record per width.
	//
	//   Opc8     - opcode of the 8-bit form
	//   Opc      - opcode shared by the 16/32/64-bit forms
	//   Form     - instruction format
	//   frag     - name prefix of the PatFrag to match; the width suffix
	//              ("_8", "_16", "_32", "_64") is appended and the result is
	//              resolved with !cast<PatFrag>
	//   mnemonic - base mnemonic; a width suffix is appended per record
	multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
	                          string frag, string mnemonic> {
	  let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
	      SchedRW = [WriteALULd, WriteRMW] in {
	    def NAME#8m : I<Opc8, Form, (outs), (ins i8mem:$dst),
	                    mnemonic # "{b}\t$dst",
	                    [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))],
	                    IIC_UNARY_MEM>, LOCK;

	    def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
	                     mnemonic # "{w}\t$dst",
	                     [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))],
	                     IIC_UNARY_MEM>, OpSize16, LOCK;

	    def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
	                     mnemonic # "{l}\t$dst",
	                     [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))],
	                     IIC_UNARY_MEM>, OpSize32, LOCK;

	    def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
	                      mnemonic # "{q}\t$dst",
	                      [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))],
	                      IIC_UNARY_MEM>, LOCK;
	  }
	}

	// Builds four PatFrags (_8, _16, _32, _64) around a one-operand atomic node.
	// Each fragment matches (atomic_op ptr) and accepts the node only when its
	// memory VT, read through MemIntrinsicSDNode, equals the named integer type.
	multiclass unary_atomic_intrin<SDNode atomic_op> {
	def _8 : PatFrag<(ops node:$ptr),
	(atomic_op node:$ptr), [{
	return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
	}]>;
	def _16 : PatFrag<(ops node:$ptr),
	(atomic_op node:$ptr), [{
	return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
	}]>;
	def _32 : PatFrag<(ops node:$ptr),
	(atomic_op node:$ptr), [{
	return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
	}]>;
	def _64 : PatFrag<(ops node:$ptr),
	(atomic_op node:$ptr), [{
	return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
	}]>;
	}

	// The defm names double as the fragment name prefixes, producing
	// X86lock_inc_8 ... X86lock_dec_64.
	defm X86lock_inc : unary_atomic_intrin<X86lock_inc>;
	defm X86lock_dec : unary_atomic_intrin<X86lock_dec>;

	// The string arguments name the fragment prefixes defined just above;
	// LOCK_ArithUnOp appends the width suffix and looks the fragment up by name.
	defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "X86lock_inc", "inc">;
	defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "X86lock_dec", "dec">;

	// Atomic compare and swap.
	// Single-record multiclass for the memory-only compare-exchange forms. The
	// record takes the defm name unchanged, has one memory operand of class
	// x86memop, matches (frag addr:$ptr), and uses a custom inserter.
	multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
	                         SDPatternOperator frag, X86MemOperand x86memop,
	                         InstrItinClass itin> {
	  let isCodeGenOnly = 1, usesCustomInserter = 1 in
	  def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
	               mnemonic # "\t$ptr",
	               [(frag addr:$ptr)], itin>, TB, LOCK;
	}

	// Compare-exchange with an explicit register operand, one record per width.
	// Each record implicitly uses and defines the accumulator of matching width
	// (AL/AX/EAX/RAX) and defines EFLAGS. The trailing integer in each pattern
	// (1, 2, 4, 8) tracks the operand width in bytes.
	//
	//   Opc8 / itin8 - opcode and itinerary of the 8-bit record
	//   Opc  / itin  - opcode and itinerary of the 16/32/64-bit records
	multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
	                          string mnemonic, SDPatternOperator frag,
	                          InstrItinClass itin8, InstrItinClass itin> {
	  let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
	    let Defs = [AL, EFLAGS], Uses = [AL] in
	    def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
	                   mnemonic # "{b}\t{$swap, $ptr\|$ptr, $swap}",
	                   [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;

	    let Defs = [AX, EFLAGS], Uses = [AX] in
	    def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
	                    mnemonic # "{w}\t{$swap, $ptr\|$ptr, $swap}",
	                    [(frag addr:$ptr, GR16:$swap, 2)], itin>,
	                  TB, OpSize16, LOCK;

	    let Defs = [EAX, EFLAGS], Uses = [EAX] in
	    def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
	                    mnemonic # "{l}\t{$swap, $ptr\|$ptr, $swap}",
	                    [(frag addr:$ptr, GR32:$swap, 4)], itin>,
	                  TB, OpSize32, LOCK;

	    let Defs = [RAX, EFLAGS], Uses = [RAX] in
	    def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
	                     mnemonic # "{q}\t{$swap, $ptr\|$ptr, $swap}",
	                     [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
	  }
	}

	// cmpxchg8b on a 64-bit memory operand, matched through X86cas8. Implicitly
	// reads EAX, EBX, ECX and EDX; implicitly writes EAX, EDX and EFLAGS.
	let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
	    SchedRW = [WriteALULd, WriteRMW] in
	defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem,
	                                IIC_CMPX_LOCK_8B>;

	// This pseudo must be used when the frame uses RBX as
	// the base pointer. Indeed, in such a situation RBX is a reserved
	// register and the register allocator will ignore any use/def of
	// it. In other words, the register allocator will not fix the clobbering
	// of RBX that will happen when setting the arguments for the instruction.
	//
	// Unlike the actual related instruction, we mark that this one
	// defines EBX (instead of using EBX).
	// The rationale is that we will define RBX during the expansion of
	// the pseudo. The argument feeding EBX is $ebx_input.
	//
	// The additional argument, $ebx_save, is a temporary register used to
	// save the value of RBX across the actual instruction.
	//
	// To make sure the register assigned to $ebx_save does not interfere with
	// the definition of the actual instruction, we use a definition $dst which
	// is tied to $ebx_save. That way, the live-range of $ebx_save spans across
	// the instruction and we are sure we will have a valid register to restore
	// the value of RBX.
	let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
	SchedRW = [WriteALULd, WriteRMW], isCodeGenOnly = 1, isPseudo = 1,
	Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in {
	def LCMPXCHG8B_SAVE_EBX :
	I<0, Pseudo, (outs GR32:$dst),
	(ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
	!strconcat("cmpxchg8b", "\t$ptr"),
	[(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
	GR32:$ebx_save))],
	IIC_CMPX_LOCK_8B>;
	}


	// cmpxchg16b on a 128-bit memory operand, matched through X86cas16 and
	// guarded by HasCmpxchg16b. It shares the 0xC7 / MRM1m encoding given to
	// LCMPXCHG8B and adds REX_W. Implicit uses are RAX, RBX, RCX and RDX; implicit
	// defs are RAX, RDX and EFLAGS.
	let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
	Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
	defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
	X86cas16, i128mem,
	IIC_CMPX_LOCK_16B>, REX_W;
	}

	// Same as LCMPXCHG8B_SAVE_EBX but for the 16-byte variant.
	// RBX appears in Defs rather than Uses; its incoming value is passed as
	// $rbx_input, and $dst is tied to $rbx_save.
	let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
	Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW],
	isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
	usesCustomInserter = 1 in {
	def LCMPXCHG16B_SAVE_RBX :
	I<0, Pseudo, (outs GR64:$dst),
	(ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
	!strconcat("cmpxchg16b", "\t$ptr"),
	[(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
	GR64:$rbx_save))],
	IIC_CMPX_LOCK_16B>;
	}

	// Register-operand cmpxchg for all four widths: opcode 0xB0 for the 8-bit
	// record and 0xB1 for the wider ones, matched through X86cas.
	defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
	X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;

	// Atomic exchange and add
	// One register/memory record per width. The result $dst is tied to the $val
	// input and EFLAGS is listed as clobbered. The fragment to match is looked up
	// by name: frag with "_8", "_16", "_32" or "_64" appended, resolved through
	// !cast<PatFrag>.
	//
	//   opc8 / itin8 - opcode and itinerary of the 8-bit record
	//   opc  / itin  - opcode and itinerary of the 16/32/64-bit records
	multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
	                             string frag,
	                             InstrItinClass itin8, InstrItinClass itin> {
	  let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
	      SchedRW = [WriteALULd, WriteRMW] in {
	    def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
	                   (ins GR8:$val, i8mem:$ptr),
	                   mnemonic # "{b}\t{$val, $ptr\|$ptr, $val}",
	                   [(set GR8:$dst,
	                         (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
	                   itin8>;

	    def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
	                    (ins GR16:$val, i16mem:$ptr),
	                    mnemonic # "{w}\t{$val, $ptr\|$ptr, $val}",
	                    [(set GR16:$dst,
	                          (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
	                    itin>, OpSize16;

	    def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
	                    (ins GR32:$val, i32mem:$ptr),
	                    mnemonic # "{l}\t{$val, $ptr\|$ptr, $val}",
	                    [(set GR32:$dst,
	                          (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
	                    itin>, OpSize32;

	    def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
	                     (ins GR64:$val, i64mem:$ptr),
	                     mnemonic # "{q}\t{$val, $ptr\|$ptr, $val}",
	                     [(set GR64:$dst,
	                           (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
	                     itin>;
	  }
	}

	// xadd instantiation: matches the atomic_load_add_<width> fragments and adds
	// the TB and LOCK mix-ins to every record.
	defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
	                               IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
	             TB, LOCK;

	/* The following multiclass tries to make sure that in code like
	* x.store (immediate op x.load(acquire), release)
	* and
	* x.store (register op x.load(acquire), release)
	* an operation directly on memory is generated instead of wasting a register.
	* It is not automatic as atomic_store/load are only lowered to MOV instructions
	* extremely late to prevent them from being accidentally reordered in the backend
	* (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
	*/
	// RELEASE_BINOP_MI - For the binary node `op`, defines opcode-0 Pseudo
	// instructions with no outputs that match
	//   (atomic_store_N addr:$dst, (op (atomic_load_N addr:$dst), src))
	// i.e. the load and the store use the same address operand $dst.
	// Generated records: NAME#8mi, NAME#8mr, NAME#32mi, NAME#32mr, NAME#64mi32,
	// NAME#64mr ("mi" = immediate source, "mr" = register source).
	multiclass RELEASE_BINOP_MI<SDNode op> {
	def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
	"#BINOP "#NAME#"8mi PSEUDO!",
	[(atomic_store_8 addr:$dst, (op
	(atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
	def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
	"#BINOP "#NAME#"8mr PSEUDO!",
	[(atomic_store_8 addr:$dst, (op
	(atomic_load_8 addr:$dst), GR8:$src))]>;
	// NAME#16 is not generated as 16-bit arithmetic instructions are considered
	// costly and avoided as far as possible by this backend anyway
	def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
	"#BINOP "#NAME#"32mi PSEUDO!",
	[(atomic_store_32 addr:$dst, (op
	(atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
	def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
	"#BINOP "#NAME#"32mr PSEUDO!",
	[(atomic_store_32 addr:$dst, (op
	(atomic_load_32 addr:$dst), GR32:$src))]>;
	// The 64-bit immediate form only matches i64immSExt32 immediates.
	def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
	"#BINOP "#NAME#"64mi32 PSEUDO!",
	[(atomic_store_64 addr:$dst, (op
	(atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
	def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
	"#BINOP "#NAME#"64mr PSEUDO!",
	[(atomic_store_64 addr:$dst, (op
	(atomic_load_64 addr:$dst), GR64:$src))]>;
	}
	// Instantiate the integer pseudos for add/and/or/xor. Everything in this scope
	// is marked as defining EFLAGS and is scheduled as WriteMicrocoded.
	let Defs = [EFLAGS], SchedRW = [WriteMicrocoded] in {
	defm RELEASE_ADD : RELEASE_BINOP_MI<add>;
	defm RELEASE_AND : RELEASE_BINOP_MI<and>;
	defm RELEASE_OR : RELEASE_BINOP_MI<or>;
	defm RELEASE_XOR : RELEASE_BINOP_MI<xor>;
	// Note: we don't deal with sub, because subtractions of constants are
	// optimized into additions before this code can run.
	}

	// Same as above, but for floating-point.
	// FIXME: imm version.
	// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
	// FIXME: This could also handle SIMD operations with ps and pd instructions.
	// Everything in this scope sets usesCustomInserter and is scheduled as
	// WriteMicrocoded.
	let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in {
	// RELEASE_FP_BINOP_MI - For the FP node `op`, defines NAME#32mr (FR32 source,
	// requires HasSSE1) and NAME#64mr (FR64 source, requires HasSSE2). Each
	// matches: atomically load an integer from $dst, bitconvert it to f32/f64,
	// apply `op` with $src, bitconvert the result back to an integer and
	// atomically store it to the same $dst.
	multiclass RELEASE_FP_BINOP_MI<SDNode op> {
	def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
	"#BINOP "#NAME#"32mr PSEUDO!",
	[(atomic_store_32 addr:$dst,
	(i32 (bitconvert (op
	(f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
	FR32:$src))))]>, Requires<[HasSSE1]>;
	def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
	"#BINOP "#NAME#"64mr PSEUDO!",
	[(atomic_store_64 addr:$dst,
	(i64 (bitconvert (op
	(f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
	FR64:$src))))]>, Requires<[HasSSE2]>;
	}
	// Only fadd is instantiated so far.
	defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
	// FIXME: Add fsub, fmul, fdiv, ...
	}

	// RELEASE_UNOP - Defines opcode-0 Pseudo instructions NAME#8m/16m/32m/64m that
	// take only a memory operand $dst and match an atomic store to $dst of the
	// caller-supplied value dag for that width (dag8, dag16, dag32, dag64).
	// The dags are expected to reference addr:$dst themselves.
	multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
	def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
	"#UNOP "#NAME#"8m PSEUDO!",
	[(atomic_store_8 addr:$dst, dag8)]>;
	def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
	"#UNOP "#NAME#"16m PSEUDO!",
	[(atomic_store_16 addr:$dst, dag16)]>;
	def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
	"#UNOP "#NAME#"32m PSEUDO!",
	[(atomic_store_32 addr:$dst, dag32)]>;
	def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
	"#UNOP "#NAME#"64m PSEUDO!",
	[(atomic_store_64 addr:$dst, dag64)]>;
	}

	// Increment/decrement pseudos, only available under the UseIncDec predicate.
	// INC is matched as "add 1" and DEC as "add -1" on the atomically loaded value.
	let Defs = [EFLAGS], Predicates = [UseIncDec], SchedRW = [WriteMicrocoded] in {
	defm RELEASE_INC : RELEASE_UNOP<
	(add (atomic_load_8 addr:$dst), (i8 1)),
	(add (atomic_load_16 addr:$dst), (i16 1)),
	(add (atomic_load_32 addr:$dst), (i32 1)),
	(add (atomic_load_64 addr:$dst), (i64 1))>;
	defm RELEASE_DEC : RELEASE_UNOP<
	(add (atomic_load_8 addr:$dst), (i8 -1)),
	(add (atomic_load_16 addr:$dst), (i16 -1)),
	(add (atomic_load_32 addr:$dst), (i32 -1)),
	(add (atomic_load_64 addr:$dst), (i64 -1))>;
	}
	/*
	TODO: These don't work because the type inference of TableGen fails.
	TODO: find a way to fix it.
	let Defs = [EFLAGS] in {
	defm RELEASE_NEG : RELEASE_UNOP<
	(ineg (atomic_load_8 addr:$dst)),
	(ineg (atomic_load_16 addr:$dst)),
	(ineg (atomic_load_32 addr:$dst)),
	(ineg (atomic_load_64 addr:$dst))>;
	}
	// NOT doesn't set flags.
	defm RELEASE_NOT : RELEASE_UNOP<
	(not (atomic_load_8 addr:$dst)),
	(not (atomic_load_16 addr:$dst)),
	(not (atomic_load_32 addr:$dst)),
	(not (atomic_load_64 addr:$dst))>;
	*/

	// Pseudo instructions for plain atomic stores (RELEASE_MOV*) and atomic loads
	// (ACQUIRE_MOV*). All are opcode-0 Pseudo records scheduled as WriteMicrocoded.
	let SchedRW = [WriteMicrocoded] in {
	// Atomic store of an immediate (the 64-bit form takes an i64immSExt32).
	def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
	"#RELEASE_MOV8mi PSEUDO!",
	[(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
	def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
	"#RELEASE_MOV16mi PSEUDO!",
	[(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
	def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
	"#RELEASE_MOV32mi PSEUDO!",
	[(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
	def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
	"#RELEASE_MOV64mi32 PSEUDO!",
	[(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;

	// Atomic store of a general-purpose register.
	def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
	"#RELEASE_MOV8mr PSEUDO!",
	[(atomic_store_8 addr:$dst, GR8 :$src)]>;
	def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
	"#RELEASE_MOV16mr PSEUDO!",
	[(atomic_store_16 addr:$dst, GR16:$src)]>;
	def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
	"#RELEASE_MOV32mr PSEUDO!",
	[(atomic_store_32 addr:$dst, GR32:$src)]>;
	def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
	"#RELEASE_MOV64mr PSEUDO!",
	[(atomic_store_64 addr:$dst, GR64:$src)]>;

	// Atomic load into a general-purpose register.
	def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
	"#ACQUIRE_MOV8rm PSEUDO!",
	[(set GR8:$dst, (atomic_load_8 addr:$src))]>;
	def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
	"#ACQUIRE_MOV16rm PSEUDO!",
	[(set GR16:$dst, (atomic_load_16 addr:$src))]>;
	def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
	"#ACQUIRE_MOV32rm PSEUDO!",
	[(set GR32:$dst, (atomic_load_32 addr:$src))]>;
	def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
	"#ACQUIRE_MOV64rm PSEUDO!",
	[(set GR64:$dst, (atomic_load_64 addr:$src))]>;
	} // SchedRW

	//===----------------------------------------------------------------------===//
	// DAG Pattern Matching Rules
	//===----------------------------------------------------------------------===//

	// Use AND/OR to store 0/-1 in memory when optimizing for minsize. This saves
	// binary size compared to a regular MOV, but it introduces an unnecessary
	// load, so is not suitable for regular or optsize functions.
	// Under OptForMinSize only: a store of 0 becomes "and mem, 0" and a store of
	// -1 becomes "or mem, -1", using the mi8 instruction forms. Only i16, i32 and
	// i64 stores are covered here.
	let Predicates = [OptForMinSize] in {
	def : Pat<(store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
	def : Pat<(store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
	def : Pat<(store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
	def : Pat<(store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
	def : Pat<(store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
	def : Pat<(store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
	}

	// In kernel code model, we can get the address of a label
	// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
	// the MOV64ri32 should accept these.
	// One pattern per wrapped symbolic operand kind (tconstpool, tjumptable,
	// tglobaladdr, texternalsym, mcsym, tblockaddress); each selects MOV64ri32
	// and is guarded by the KernelCode predicate.
	def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
	(MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
	def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
	(MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
	def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
	(MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
	def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
	(MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
	def : Pat<(i64 (X86Wrapper mcsym:$dst)),
	(MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>;
	def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
	(MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;

	// If we have small model and -static mode, it is safe to store global addresses
	// directly as immediates. FIXME: This is really a hack, the 'imm' predicate
	// for MOV64mi32 should handle this sort of thing.
	// One pattern per wrapped symbolic operand kind; each folds the i64 store of
	// the wrapped address into MOV64mi32 and requires both NearData and IsNotPIC.
	def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
	(MOV64mi32 addr:$dst, tconstpool:$src)>,
	Requires<[NearData, IsNotPIC]>;
	def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
	(MOV64mi32 addr:$dst, tjumptable:$src)>,
	Requires<[NearData, IsNotPIC]>;
	def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
	(MOV64mi32 addr:$dst, tglobaladdr:$src)>,
	Requires<[NearData, IsNotPIC]>;
	def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
	(MOV64mi32 addr:$dst, texternalsym:$src)>,
	Requires<[NearData, IsNotPIC]>;
	def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst),
	(MOV64mi32 addr:$dst, mcsym:$src)>,
	Requires<[NearData, IsNotPIC]>;
	def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
	(MOV64mi32 addr:$dst, tblockaddress:$src)>,
	Requires<[NearData, IsNotPIC]>;

	// X86RecoverFrameAlloc of an MC symbol is selected as a move of that symbol
	// into a register: MOV32ri for an i32 result, MOV64ri for an i64 result.
	def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>;
	def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>;

	// Calls

	// tls has some funny stuff here...
	// This corresponds to movabs $foo@tpoff, %rax
	// Wrapped TLS global address used directly as a value.
	// NOTE(review): the selected instruction is MOV64ri32, not MOV64ri — confirm
	// the "movabs" wording in the comment preceding this pattern.
	def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
	(MOV64ri32 tglobaltlsaddr :$dst)>;
	// This corresponds to add $foo@tpoff, %rax
	def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
	(ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;


	// Direct PC relative function call for small code model. 32-bit displacement
	// sign extended to 64-bit.
	// i64 call targets that are a global address or an external symbol are both
	// selected as CALL64pcrel32.
	def : Pat<(X86call (i64 tglobaladdr:$dst)),
	(CALL64pcrel32 tglobaladdr:$dst)>;
	def : Pat<(X86call (i64 texternalsym:$dst)),
	(CALL64pcrel32 texternalsym:$dst)>;

	// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
	// can never use callee-saved registers. That is the purpose of the GR64_TC
	// register classes.
	//
	// The only volatile register that is never used by the calling convention is
	// %r11. This happens when calling a vararg function with 6 arguments.
	//
	// Match an X86tcret that uses less than 7 volatile registers.
	// X86tcret_6regs - Matches an X86tcret node only if at most 6 of its operands
	// from index 3 onwards are RegisterSDNodes. The predicate returns false as
	// soon as a 7th register operand is seen.
	def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
	(X86tcret node:$ptr, node:$off), [{
	// X86tcret args: (*chain, ptr, imm, regs..., glue)
	unsigned NumRegs = 0;
	for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
	if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
	return false;
	return true;
	}]>;

	def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
	(TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
	- Requires<[Not64BitMode]>;
	+ Requires<[Not64BitMode, NotUseRetpoline]>;

	// FIXME: This is disabled for 32-bit PIC mode because the global base
	// register which is part of the address mode may be assigned a
	// callee-saved register.
	def : Pat<(X86tcret (load addr:$dst), imm:$off),
	(TCRETURNmi addr:$dst, imm:$off)>,
	- Requires<[Not64BitMode, IsNotPIC]>;
	+ Requires<[Not64BitMode, IsNotPIC, NotUseRetpoline]>;

	def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
	(TCRETURNdi tglobaladdr:$dst, imm:$off)>,
	Requires<[NotLP64]>;

	def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
	(TCRETURNdi texternalsym:$dst, imm:$off)>,
	Requires<[NotLP64]>;

	def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
	(TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
	- Requires<[In64BitMode]>;
	+ Requires<[In64BitMode, NotUseRetpoline]>;

	// Don't fold loads into X86tcret requiring more than 6 regs.
	// There wouldn't be enough scratch registers for base+index.
	def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
	(TCRETURNmi64 addr:$dst, imm:$off)>,
	- Requires<[In64BitMode]>;
	+ Requires<[In64BitMode, NotUseRetpoline]>;
	+
	+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
	+ (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
	+ Requires<[In64BitMode, UseRetpoline]>;
	+
	+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
	+ (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
	+ Requires<[Not64BitMode, UseRetpoline]>;

	def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
	(TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
	Requires<[IsLP64]>;

	def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
	(TCRETURNdi64 texternalsym:$dst, imm:$off)>,
	Requires<[IsLP64]>;

	// Normal calls, with various flavors of addresses.
	// i32 call targets: global address, external symbol, and (only when the
	// CallImmAddr predicate holds) a plain immediate all select CALLpcrel32.
	def : Pat<(X86call (i32 tglobaladdr:$dst)),
	(CALLpcrel32 tglobaladdr:$dst)>;
	def : Pat<(X86call (i32 texternalsym:$dst)),
	(CALLpcrel32 texternalsym:$dst)>;
	def : Pat<(X86call (i32 imm:$dst)),
	(CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;

	// Comparisons.

	// TEST R,R is smaller than CMP R,0
	// A compare of a register against 0 is selected as TEST of the register with
	// itself, for all four register widths.
	def : Pat<(X86cmp GR8:$src1, 0),
	(TEST8rr GR8:$src1, GR8:$src1)>;
	def : Pat<(X86cmp GR16:$src1, 0),
	(TEST16rr GR16:$src1, GR16:$src1)>;
	def : Pat<(X86cmp GR32:$src1, 0),
	(TEST32rr GR32:$src1, GR32:$src1)>;
	def : Pat<(X86cmp GR64:$src1, 0),
	(TEST64rr GR64:$src1, GR64:$src1)>;

	// Conditional moves with folded loads with operands swapped and conditions
	// inverted.
	// CMOVmr - Emits three patterns (16/32/64-bit), guarded by HasCMov, for an
	// X86cmov whose FIRST value operand is a load and whose condition is
	// InvertedCond. Each selects the given rm instruction with the register as
	// the first operand and the memory address as the second, i.e. operands are
	// swapped relative to the matched node.
	//   InvertedCond     - condition-code leaf to match on the X86cmov node.
	//   Inst16/32/64     - instruction to emit for each width; the caller passes
	//                      the instruction for the opposite condition.
	multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
	Instruction Inst64> {
	let Predicates = [HasCMov] in {
	def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
	(Inst16 GR16:$src2, addr:$src1)>;
	def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
	(Inst32 GR32:$src2, addr:$src1)>;
	def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
	(Inst64 GR64:$src2, addr:$src1)>;
	}
	}

	// Instantiate CMOVmr for all 16 condition codes. Each line pairs a matched
	// condition with the CMOV instructions of its opposite (B<->AE, E<->NE,
	// BE<->A, L<->GE, LE<->G, P<->NP, S<->NS, O<->NO).
	defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
	defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
	defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
	defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
	defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
	defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
	defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
	defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
	defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
	defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
	defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
	defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
	defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
	defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
	defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
	defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;

	// zextload bool -> zextload byte
	// i1 stored in one byte in zero-extended form.
	// Upper bits cleanup should be executed before Store.
	// Zero-extending loads of i1 are selected as byte loads: a plain MOV8rm for an
	// i8 result, MOVZX from 8 bits for i16/i32, and for i64 a MOVZX32rm8 wrapped
	// in SUBREG_TO_REG.
	def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
	def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
	def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
	def : Pat<(zextloadi64i1 addr:$src),
	(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;

	// extload bool -> extload byte
	// When extloading from 16-bit and smaller memory locations into 64-bit
	// registers, use zero-extending loads so that the entire 64-bit register is
	// defined, avoiding partial-register updates.

	// Any-extending loads with i8/i16/i32 results are selected exactly like the
	// corresponding zero-extending loads (MOV8rm / MOVZX*).
	def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
	def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
	def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
	def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
	def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
	def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;

	// For other extloads, use subregs, since the high contents of the register are
	// defined after an extload.
	// (All i64-result forms: a 32-bit load or MOVZX wrapped in SUBREG_TO_REG.)
	def : Pat<(extloadi64i1 addr:$src),
	(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
	def : Pat<(extloadi64i8 addr:$src),
	(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
	def : Pat<(extloadi64i16 addr:$src),
	(SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
	def : Pat<(extloadi64i32 addr:$src),
	(SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;

	// anyext. Define these to do an explicit zero-extend to
	// avoid partial-register updates.
	// anyext from GR8: zero-extend to 32 bits with MOVZX32rr8; the i16 result is
	// taken as the 16-bit subregister of that 32-bit value.
	def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
	(MOVZX32rr8 GR8 :$src), sub_16bit)>;
	def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;

	// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32.
	// (Here the upper bits come from an IMPLICIT_DEF, i.e. are left undefined.)
	def : Pat<(i32 (anyext GR16:$src)),
	(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;

	// anyext to i64: GR8/GR16 sources are zero-extended via MOVZX32 and wrapped in
	// SUBREG_TO_REG; a GR32 source is inserted into an IMPLICIT_DEF.
	def : Pat<(i64 (anyext GR8 :$src)),
	(SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
	def : Pat<(i64 (anyext GR16:$src)),
	(SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
	def : Pat<(i64 (anyext GR32:$src)),
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, sub_32bit)>;


	// Any instruction that defines a 32-bit result leaves the high half of the
	// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
	// be copying from a truncate. Any other 32-bit operation will zero-extend
	// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
	// 32 bits, they're probably just qualifying a CopyFromReg.
	// def32 - Matches an i32 GR32 value unless the node producing it is one of
	// TRUNCATE, EXTRACT_SUBREG, CopyFromReg, AssertSext or AssertZext.
	def def32 : PatLeaf<(i32 GR32:$src), [{
	return N->getOpcode() != ISD::TRUNCATE &&
	N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
	N->getOpcode() != ISD::CopyFromReg &&
	N->getOpcode() != ISD::AssertSext &&
	N->getOpcode() != ISD::AssertZext;
	}]>;

	// In the case of a 32-bit def that is known to implicitly zero-extend,
	// we can use a SUBREG_TO_REG.
	// zext to i64 of a value accepted by def32 needs no instruction of its own:
	// the GR32 is simply wrapped in SUBREG_TO_REG.
	def : Pat<(i64 (zext def32:$src)),
	(SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;

	//===----------------------------------------------------------------------===//
	// Pattern match OR as ADD
	//===----------------------------------------------------------------------===//

	// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
	// 3-addressified into an LEA instruction to avoid copies. However, we also
	// want to finally emit these instructions as an or at the end of the code
	// generator to make the generated code easier to read. To do this, we select
	// into "disjoint bits" pseudo ops.

	// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
	// or_is_add - Matches (or lhs, rhs) only when no bit position can be set in
	// both operands:
	//  * if operand 1 is a constant, every bit set in that constant must be known
	//    zero in operand 0 (MaskedValueIsZero);
	//  * otherwise known bits are computed for both operands and, for every bit,
	//    at least one of the two operands must have it known zero.
	def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
	if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
	return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());

	KnownBits Known0;
	CurDAG->computeKnownBits(N->getOperand(0), Known0, 0);
	KnownBits Known1;
	CurDAG->computeKnownBits(N->getOperand(1), Known1, 0);
	return (~Known0.Zero & ~Known1.Zero) == 0;
	}]>;


	// (or x1, x2) -> (add x1, x2) if the two operands are known not to share bits.
	// Try this before selecting to OR.
	// "Disjoint bits" ADD pseudos: opcode-0 Pseudo instructions with an empty asm
	// string that match or_is_add. AddedComplexity = 5 raises their priority.
	let AddedComplexity = 5, SchedRW = [WriteALU] in {

	// All forms are two-address ($src1 tied to $dst), define EFLAGS and are
	// marked convertible to three-address form.
	let isConvertibleToThreeAddress = 1,
	Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
	// Register/register forms; these are additionally commutable.
	let isCommutable = 1 in {
	def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
	"", // orw/addw REG, REG
	[(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
	def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
	"", // orl/addl REG, REG
	[(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
	def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
	"", // orq/addq REG, REG
	[(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
	} // isCommutable

	// NOTE: These are order specific, we want the ri8 forms to be listed
	// first so that they are slightly preferred to the ri forms.

	// 16-bit register/immediate forms.
	def ADD16ri8_DB : I<0, Pseudo,
	(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
	"", // orw/addw REG, imm8
	[(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
	def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
	"", // orw/addw REG, imm
	[(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;

	// 32-bit register/immediate forms.
	def ADD32ri8_DB : I<0, Pseudo,
	(outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
	"", // orl/addl REG, imm8
	[(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
	def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
	"", // orl/addl REG, imm
	[(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;


	// 64-bit register/immediate forms (sign-extended 8- or 32-bit immediates).
	def ADD64ri8_DB : I<0, Pseudo,
	(outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
	"", // orq/addq REG, imm8
	[(set GR64:$dst, (or_is_add GR64:$src1,
	i64immSExt8:$src2))]>;
	def ADD64ri32_DB : I<0, Pseudo,
	(outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
	"", // orq/addq REG, imm
	[(set GR64:$dst, (or_is_add GR64:$src1,
	i64immSExt32:$src2))]>;
	}
	} // AddedComplexity, SchedRW


	//===----------------------------------------------------------------------===//
	// Some peepholes
	//===----------------------------------------------------------------------===//

	// Odd encoding trick: -128 fits into an 8-bit immediate field while
	// +128 doesn't, so in this special case use a sub instead of an add.
	// add x, 128 ==> sub x, -128, for register operands and for the
	// load/add/store-to-same-address form, at 16, 32 and 64 bits.
	def : Pat<(add GR16:$src1, 128),
	(SUB16ri8 GR16:$src1, -128)>;
	def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
	(SUB16mi8 addr:$dst, -128)>;

	def : Pat<(add GR32:$src1, 128),
	(SUB32ri8 GR32:$src1, -128)>;
	def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
	(SUB32mi8 addr:$dst, -128)>;

	def : Pat<(add GR64:$src1, 128),
	(SUB64ri8 GR64:$src1, -128)>;
	def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
	(SUB64mi8 addr:$dst, -128)>;

	// The same trick applies for 32-bit immediate fields in 64-bit
	// instructions.
	// add x, 0x80000000 ==> sub x, 0xffffffff80000000 (the 64-bit sign-extension
	// of 0x80000000 viewed as a negative 32-bit immediate).
	def : Pat<(add GR64:$src1, 0x0000000080000000),
	(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
	def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
	(SUB64mi32 addr:$dst, 0xffffffff80000000)>;

	// To avoid needing to materialize an immediate in a register, use a 32-bit and
	// with implicit zero-extension instead of a 64-bit and if the immediate has at
	// least 32 bits of leading zeros. If in addition the last 32 bits can be
	// represented with a sign extension of a 8 bit constant, use that.
	// This can also reduce instruction size by eliminating the need for the REX
	// prefix.

	// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32.
	// 64-bit 'and' with an immediate matching i64immZExt32SExt8 / i64immZExt32:
	// perform a 32-bit AND on the low subregister and wrap the result in
	// SUBREG_TO_REG. The immediate is converted with GetLo8XForm / GetLo32XForm.
	let AddedComplexity = 1 in {
	def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
	(SUBREG_TO_REG
	(i64 0),
	(AND32ri8
	(EXTRACT_SUBREG GR64:$src, sub_32bit),
	(i32 (GetLo8XForm imm:$imm))),
	sub_32bit)>;

	def : Pat<(and GR64:$src, i64immZExt32:$imm),
	(SUBREG_TO_REG
	(i64 0),
	(AND32ri
	(EXTRACT_SUBREG GR64:$src, sub_32bit),
	(i32 (GetLo32XForm imm:$imm))),
	sub_32bit)>;
	} // AddedComplexity = 1


	// AddedComplexity is needed due to the increased complexity on the
	// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all
	// the MOVZX patterns keeps them together in DAGIsel tables.
	// 'and' with an all-ones low mask (0xff, 0xffff, 0xffffffff) is selected as a
	// zero-extending move of the matching subregister instead of an AND.
	let AddedComplexity = 1 in {
	// r & (2^16-1) ==> movz
	def : Pat<(and GR32:$src1, 0xffff),
	(MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
	// r & (2^8-1) ==> movz
	def : Pat<(and GR32:$src1, 0xff),
	(MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>;
	// r & (2^8-1) ==> movz
	// (16-bit result: extend to 32 bits, then take the 16-bit subregister.)
	def : Pat<(and GR16:$src1, 0xff),
	(EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)),
	sub_16bit)>;

	// r & (2^32-1) ==> 32-bit mov (MOV32rr) wrapped in SUBREG_TO_REG
	def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
	(SUBREG_TO_REG (i64 0),
	(MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
	sub_32bit)>;
	// r & (2^16-1) ==> movz
	def : Pat<(and GR64:$src, 0xffff),
	(SUBREG_TO_REG (i64 0),
	(MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
	sub_32bit)>;
	// r & (2^8-1) ==> movz
	def : Pat<(and GR64:$src, 0xff),
	(SUBREG_TO_REG (i64 0),
	(MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
	sub_32bit)>;
	} // AddedComplexity = 1


	// sext_inreg patterns
	// sext_inreg is selected as a MOVSX from the subregister holding the narrow
	// value. 32-bit results:
	def : Pat<(sext_inreg GR32:$src, i16),
	(MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
	def : Pat<(sext_inreg GR32:$src, i8),
	(MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>;

	// 16-bit result: sign-extend to 32 bits, then take the 16-bit subregister.
	def : Pat<(sext_inreg GR16:$src, i8),
	(EXTRACT_SUBREG (MOVSX32rr8 (EXTRACT_SUBREG GR16:$src, sub_8bit)),
	sub_16bit)>;

	// 64-bit results:
	def : Pat<(sext_inreg GR64:$src, i32),
	(MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
	def : Pat<(sext_inreg GR64:$src, i16),
	(MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
	def : Pat<(sext_inreg GR64:$src, i8),
	(MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;

	// sext, sext_load, zext, zext_load
	// i8 -> i16 sign/zero extension (register and load forms) is done with the
	// 32-bit MOVSX/MOVZX instruction; the i16 result is its 16-bit subregister.
	def: Pat<(i16 (sext GR8:$src)),
	(EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
	def: Pat<(sextloadi16i8 addr:$src),
	(EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
	def: Pat<(i16 (zext GR8:$src)),
	(EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
	def: Pat<(zextloadi16i8 addr:$src),
	(EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;

	// trunc patterns
	// Truncation is a subregister extract.
	def : Pat<(i16 (trunc GR32:$src)),
	(EXTRACT_SUBREG GR32:$src, sub_16bit)>;
	// Outside 64-bit mode, truncation to i8 first copies the source into the
	// GR32_ABCD / GR16_ABCD register class before extracting sub_8bit.
	def : Pat<(i8 (trunc GR32:$src)),
	(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
	sub_8bit)>,
	Requires<[Not64BitMode]>;
	def : Pat<(i8 (trunc GR16:$src)),
	(EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
	sub_8bit)>,
	Requires<[Not64BitMode]>;
	// Truncations from GR64 (no mode predicate attached).
	def : Pat<(i32 (trunc GR64:$src)),
	(EXTRACT_SUBREG GR64:$src, sub_32bit)>;
	def : Pat<(i16 (trunc GR64:$src)),
	(EXTRACT_SUBREG GR64:$src, sub_16bit)>;
	def : Pat<(i8 (trunc GR64:$src)),
	(EXTRACT_SUBREG GR64:$src, sub_8bit)>;
	// In 64-bit mode, truncation to i8 extracts sub_8bit directly, without the
	// register-class copy used above.
	def : Pat<(i8 (trunc GR32:$src)),
	(EXTRACT_SUBREG GR32:$src, sub_8bit)>,
	Requires<[In64BitMode]>;
	def : Pat<(i8 (trunc GR16:$src)),
	(EXTRACT_SUBREG GR16:$src, sub_8bit)>,
	Requires<[In64BitMode]>;

	// h-register tricks
	// Outside 64-bit mode: truncating a value shifted right by 8 to i8 is a direct
	// read of the sub_8bit_hi subregister.
	def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
	(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
	Requires<[Not64BitMode]>;
	def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))),
	(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
	Requires<[Not64BitMode]>;
	def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
	(EXTRACT_SUBREG GR32:$src, sub_8bit_hi)>,
	Requires<[Not64BitMode]>;
	// The remaining patterns carry no mode predicate. They zero-extend the
	// sub_8bit_hi subregister with MOVZX32_NOREXrr8.
	def : Pat<(srl GR16:$src, (i8 8)),
	(EXTRACT_SUBREG
	(MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
	sub_16bit)>;
	def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
	(MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
	def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
	(MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
	// Two equivalent spellings of "bits 8..15 of a 32-bit value":
	// (x >> 8) & 255 and (x & 0xff00) >> 8.
	def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
	(MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
	def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
	(MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;

	// h-register tricks.
	// For now, be conservative on x86-64 and use an h-register extract only if the
	// value is immediately zero-extended or stored, which are somewhat common
	// cases. This uses a bunch of code to prevent a register requiring a REX prefix
	// from being allocated in the same instruction as the h register, as there's
	// currently no way to describe this requirement to the register allocator.

	// h-register extract and zero-extend.
	// 64-bit results: zero-extend sub_8bit_hi with MOVZX32_NOREXrr8 and wrap the
	// 32-bit result in SUBREG_TO_REG. Covers (x >> 8) & 255 on GR64 and
	// zext/anyext of a GR16 shifted right by 8.
	def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
	(SUBREG_TO_REG
	(i64 0),
	(MOVZX32_NOREXrr8
	(EXTRACT_SUBREG GR64:$src, sub_8bit_hi)),
	sub_32bit)>;
	def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
	(SUBREG_TO_REG
	(i64 0),
	(MOVZX32_NOREXrr8
	(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
	sub_32bit)>;
	def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
	(SUBREG_TO_REG
	(i64 0),
	(MOVZX32_NOREXrr8
	(EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
	sub_32bit)>;

	// h-register extract and store.
	// Store of bits 8..15 of a register: store the sub_8bit_hi subregister with
	// MOV8mr_NOREX. The GR32 and GR16 source forms are restricted to 64-bit mode;
	// the GR64 form has no explicit predicate.
	def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
	(MOV8mr_NOREX
	addr:$dst,
	(EXTRACT_SUBREG GR64:$src, sub_8bit_hi))>;
	def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
	(MOV8mr_NOREX
	addr:$dst,
	(EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>,
	Requires<[In64BitMode]>;
	def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
	(MOV8mr_NOREX
	addr:$dst,
	(EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>,
	Requires<[In64BitMode]>;


	// (shl x, 1) ==> (add x, x)
	// Note that if x is undef (immediate or otherwise), we could theoretically
	// end up with the two uses of x getting different values, producing a result
	// where the least significant bit is not 0. However, the probability of this
	// happening is considered low enough that this is officially not a
	// "real problem".
	// One pattern per register width; the same register is used for both ADD
	// operands.
	def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
	def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
	def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
	def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;

	// Helper imms to check if a mask doesn't change significant shift/rotate bits.
	// Each leaf matches an i8 immediate mask whose lowest k bits are all ones,
	// where 2^k is the bit width in the leaf's name.
	// immShift8: at least 3 trailing one bits.
	def immShift8 : ImmLeaf<i8, [{
	return countTrailingOnes<uint64_t>(Imm) >= 3;
	}]>;
	// immShift16: at least 4 trailing one bits.
	def immShift16 : ImmLeaf<i8, [{
	return countTrailingOnes<uint64_t>(Imm) >= 4;
	}]>;
	// immShift32: at least 5 trailing one bits.
	def immShift32 : ImmLeaf<i8, [{
	return countTrailingOnes<uint64_t>(Imm) >= 5;
	}]>;
	// immShift64: at least 6 trailing one bits.
	def immShift64 : ImmLeaf<i8, [{
	return countTrailingOnes<uint64_t>(Imm) >= 6;
	}]>;

	// Shift amount is implicitly masked.
	// MaskedShiftAmountPats - Drops an 'and' applied to the CL shift amount when
	// the mask satisfies immShift32 (8/16/32-bit operands) or immShift64 (64-bit
	// operands), for both register and load/op/store-to-same-address forms.
	//   frag - the shift node to match.
	//   name - instruction name prefix; "<width>rCL" or "<width>mCL" is appended
	//          and the result is cast to Instruction.
	// Note that the 8- and 16-bit forms use the same 5-bit mask test as the
	// 32-bit form.
	multiclass MaskedShiftAmountPats<SDNode frag, string name> {
	// (shift x (and y, 31)) ==> (shift x, y)
	def : Pat<(frag GR8:$src1, (and CL, immShift32)),
	(!cast<Instruction>(name # "8rCL") GR8:$src1)>;
	def : Pat<(frag GR16:$src1, (and CL, immShift32)),
	(!cast<Instruction>(name # "16rCL") GR16:$src1)>;
	def : Pat<(frag GR32:$src1, (and CL, immShift32)),
	(!cast<Instruction>(name # "32rCL") GR32:$src1)>;
	def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
	(!cast<Instruction>(name # "8mCL") addr:$dst)>;
	def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
	(!cast<Instruction>(name # "16mCL") addr:$dst)>;
	def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
	(!cast<Instruction>(name # "32mCL") addr:$dst)>;

	// (shift x (and y, 63)) ==> (shift x, y)
	def : Pat<(frag GR64:$src1, (and CL, immShift64)),
	(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
	def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
	(!cast<Instruction>(name # "64mCL") addr:$dst)>;
	}

	// Instantiate for left shift, logical right shift and arithmetic right shift.
	defm : MaskedShiftAmountPats<shl, "SHL">;
	defm : MaskedShiftAmountPats<srl, "SHR">;
	defm : MaskedShiftAmountPats<sra, "SAR">;

	// ROL/ROR instructions allow a stronger mask optimization than shift for 8- and
	// 16-bit. We can remove a mask of any (bitwidth - 1) on the rotation amount
	// because over-rotating produces the same result. This is noted in the Intel
	// docs with: "tempCOUNT <- (COUNT & COUNTMASK) MOD SIZE". Masking the rotation
	// amount could affect EFLAGS results, but that does not matter because we are
	// not tracking flags for these nodes.
	// MaskedRotateAmountPats - Like MaskedShiftAmountPats, but the 8- and 16-bit
	// forms accept the narrower immShift8 / immShift16 masks; the 32- and 64-bit
	// forms use immShift32 / immShift64.
	//   frag - the rotate node to match.
	//   name - instruction name prefix; "<width>rCL" or "<width>mCL" is appended
	//          and the result is cast to Instruction.
	multiclass MaskedRotateAmountPats<SDNode frag, string name> {
	// (rot x (and y, BitWidth - 1)) ==> (rot x, y)
	def : Pat<(frag GR8:$src1, (and CL, immShift8)),
	(!cast<Instruction>(name # "8rCL") GR8:$src1)>;
	def : Pat<(frag GR16:$src1, (and CL, immShift16)),
	(!cast<Instruction>(name # "16rCL") GR16:$src1)>;
	def : Pat<(frag GR32:$src1, (and CL, immShift32)),
	(!cast<Instruction>(name # "32rCL") GR32:$src1)>;
	def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift8)), addr:$dst),
	(!cast<Instruction>(name # "8mCL") addr:$dst)>;
	def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift16)), addr:$dst),
	(!cast<Instruction>(name # "16mCL") addr:$dst)>;
	def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
	(!cast<Instruction>(name # "32mCL") addr:$dst)>;

	// (rot x (and y, 63)) ==> (rot x, y)
	def : Pat<(frag GR64:$src1, (and CL, immShift64)),
	(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
	def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
	(!cast<Instruction>(name # "64mCL") addr:$dst)>;
	}


	// Instantiate for rotate-left and rotate-right.
	defm : MaskedRotateAmountPats<rotl, "ROL">;
	defm : MaskedRotateAmountPats<rotr, "ROR">;

	// Double shift amount is implicitly masked.
	// MaskedDoubleShiftAmountPats - Drops an 'and' applied to the CL amount of a
	// two-source shift node. Only register forms at 16/32 bits (immShift32) and
	// 64 bits (immShift64) are generated; there are no 8-bit or memory forms.
	//   frag - the double-shift node to match.
	//   name - instruction name prefix; "<width>rrCL" is appended and the result
	//          is cast to Instruction.
	multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
	// (shift x (and y, 31)) ==> (shift x, y)
	def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)),
	(!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
	def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)),
	(!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;

	// (shift x (and y, 63)) ==> (shift x, y)
	def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)),
	(!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
	}

	defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
	defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;

	// With HasBMI2, sra/srl/shl whose GR8 count is masked with immShift32 or
	// immShift64 select SARX/SHRX/SHLX. The 8-bit count is placed in the sub_8bit
	// lane of an IMPLICIT_DEF register of the operand width via INSERT_SUBREG.
	let Predicates = [HasBMI2] in {
	  // Register-source forms.
	  let AddedComplexity = 1 in {
	    def : Pat<(sra GR32:$src1, (and GR8:$src2, immShift32)),
	              (SARX32rr GR32:$src1,
	                        (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;
	    def : Pat<(sra GR64:$src1, (and GR8:$src2, immShift64)),
	              (SARX64rr GR64:$src1,
	                        (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;

	    def : Pat<(srl GR32:$src1, (and GR8:$src2, immShift32)),
	              (SHRX32rr GR32:$src1,
	                        (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;
	    def : Pat<(srl GR64:$src1, (and GR8:$src2, immShift64)),
	              (SHRX64rr GR64:$src1,
	                        (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;

	    def : Pat<(shl GR32:$src1, (and GR8:$src2, immShift32)),
	              (SHLX32rr GR32:$src1,
	                        (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;
	    def : Pat<(shl GR64:$src1, (and GR8:$src2, immShift64)),
	              (SHLX64rr GR64:$src1,
	                        (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;
	  }

	  // Memory-source forms, given a negative AddedComplexity.
	  let AddedComplexity = -20 in {
	    def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
	              (SARX32rm addr:$src1,
	                        (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;
	    def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
	              (SARX64rm addr:$src1,
	                        (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;

	    def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
	              (SHRX32rm addr:$src1,
	                        (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;
	    def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
	              (SHRX64rm addr:$src1,
	                        (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;

	    def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
	              (SHLX32rm addr:$src1,
	                        (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;
	    def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
	              (SHLX64rm addr:$src1,
	                        (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
	                                       GR8:$src2, sub_8bit))>;
	  }
	}

	// An anyext of a carry-conditioned setcc_carry selects the wider SETB_C form:
	//   (anyext (setcc_carry)) -> (setcc_carry)
	def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C16r)>;
	def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C32r)>;
	def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C32r)>;

	//===----------------------------------------------------------------------===//
	// EFLAGS-defining Patterns
	//===----------------------------------------------------------------------===//

	// add: register + register (8/16/32-bit)
	def : Pat<(add GR8:$src1, GR8:$src2),
	          (ADD8rr GR8:$src1, GR8:$src2)>;
	def : Pat<(add GR16:$src1, GR16:$src2),
	          (ADD16rr GR16:$src1, GR16:$src2)>;
	def : Pat<(add GR32:$src1, GR32:$src2),
	          (ADD32rr GR32:$src1, GR32:$src2)>;

	// add: register + loaded memory operand
	def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
	          (ADD8rm GR8:$src1, addr:$src2)>;
	def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
	          (ADD16rm GR16:$src1, addr:$src2)>;
	def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
	          (ADD32rm GR32:$src1, addr:$src2)>;

	// add: register + immediate (full-width, then sign-extended 8-bit forms)
	def : Pat<(add GR8:$src1, imm:$src2),
	          (ADD8ri GR8:$src1, imm:$src2)>;
	def : Pat<(add GR16:$src1, imm:$src2),
	          (ADD16ri GR16:$src1, imm:$src2)>;
	def : Pat<(add GR32:$src1, imm:$src2),
	          (ADD32ri GR32:$src1, imm:$src2)>;
	def : Pat<(add GR16:$src1, i16immSExt8:$src2),
	          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
	def : Pat<(add GR32:$src1, i32immSExt8:$src2),
	          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;

	// sub: register - register (8/16/32-bit)
	def : Pat<(sub GR8:$src1, GR8:$src2),
	          (SUB8rr GR8:$src1, GR8:$src2)>;
	def : Pat<(sub GR16:$src1, GR16:$src2),
	          (SUB16rr GR16:$src1, GR16:$src2)>;
	def : Pat<(sub GR32:$src1, GR32:$src2),
	          (SUB32rr GR32:$src1, GR32:$src2)>;

	// sub: register - loaded memory operand
	def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
	          (SUB8rm GR8:$src1, addr:$src2)>;
	def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
	          (SUB16rm GR16:$src1, addr:$src2)>;
	def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
	          (SUB32rm GR32:$src1, addr:$src2)>;

	// sub: register - immediate (full-width, then sign-extended 8-bit forms)
	def : Pat<(sub GR8:$src1, imm:$src2), (SUB8ri GR8:$src1, imm:$src2)>;
	def : Pat<(sub GR16:$src1, imm:$src2), (SUB16ri GR16:$src1, imm:$src2)>;
	def : Pat<(sub GR32:$src1, imm:$src2), (SUB32ri GR32:$src1, imm:$src2)>;
	def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
	          (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
	def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
	          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;

	// Flag-producing subtract from zero selects NEG:  sub 0, reg
	def : Pat<(X86sub_flag 0, GR8:$src), (NEG8r GR8:$src)>;
	def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
	def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
	def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;

	// Flag-producing 64-bit subtract of a relocatable immediate:  sub reg, relocImm
	def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
	          (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
	def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2),
	          (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;

	// mul: register * register (16/32-bit)
	def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>;
	def : Pat<(mul GR32:$src1, GR32:$src2), (IMUL32rr GR32:$src1, GR32:$src2)>;

	// mul: register * loaded memory operand
	def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
	          (IMUL16rm GR16:$src1, addr:$src2)>;
	def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
	          (IMUL32rm GR32:$src1, addr:$src2)>;

	// mul: register * immediate (full-width, then sign-extended 8-bit forms)
	def : Pat<(mul GR16:$src1, imm:$src2), (IMUL16rri GR16:$src1, imm:$src2)>;
	def : Pat<(mul GR32:$src1, imm:$src2), (IMUL32rri GR32:$src1, imm:$src2)>;
	def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
	          (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
	def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
	          (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;

	// mul: loaded memory operand * immediate, result in a register
	def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
	          (IMUL16rmi addr:$src1, imm:$src2)>;
	def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
	          (IMUL32rmi addr:$src1, imm:$src2)>;
	def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
	          (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
	def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
	          (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;

	// 64-bit patterns mapping nodes that do not produce flags onto instructions
	// that do.

	// 64-bit add: reg, sign-extended 8/32-bit immediate, and memory sources.
	def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>;
	def : Pat<(add GR64:$src1, i64immSExt8:$src2),
	          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
	def : Pat<(add GR64:$src1, i64immSExt32:$src2),
	          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
	def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
	          (ADD64rm GR64:$src1, addr:$src2)>;

	// 64-bit sub: reg, memory, and sign-extended 8/32-bit immediate sources.
	def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>;
	def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
	          (SUB64rm GR64:$src1, addr:$src2)>;
	def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
	          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
	def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
	          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;

	// 64-bit mul: reg*reg, reg*mem, reg*imm and mem*imm.
	def : Pat<(mul GR64:$src1, GR64:$src2), (IMUL64rr GR64:$src1, GR64:$src2)>;
	def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
	          (IMUL64rm GR64:$src1, addr:$src2)>;
	def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
	          (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
	def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
	          (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
	def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
	          (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
	def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
	          (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;

	// Register increment / decrement: add of +1 selects INC and add of -1 selects
	// DEC. Guarded by UseIncDec so INC/DEC are not formed where they are slow.
	let Predicates = [UseIncDec] in {
	  // add reg, 1
	  def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>;
	  def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>;
	  def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>;
	  def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;

	  // add reg, -1
	  def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>;
	  def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
	  def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
	  def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
	}

	// or: register | register
	def : Pat<(or GR8:$src1, GR8:$src2),
	          (OR8rr GR8:$src1, GR8:$src2)>;
	def : Pat<(or GR16:$src1, GR16:$src2),
	          (OR16rr GR16:$src1, GR16:$src2)>;
	def : Pat<(or GR32:$src1, GR32:$src2),
	          (OR32rr GR32:$src1, GR32:$src2)>;
	def : Pat<(or GR64:$src1, GR64:$src2),
	          (OR64rr GR64:$src1, GR64:$src2)>;

	// or: register | loaded memory operand
	def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
	          (OR8rm GR8:$src1, addr:$src2)>;
	def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
	          (OR16rm GR16:$src1, addr:$src2)>;
	def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
	          (OR32rm GR32:$src1, addr:$src2)>;
	def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
	          (OR64rm GR64:$src1, addr:$src2)>;

	// or: register | immediate
	def : Pat<(or GR8:$src1, imm:$src2),
	          (OR8ri GR8:$src1, imm:$src2)>;
	def : Pat<(or GR16:$src1, imm:$src2),
	          (OR16ri GR16:$src1, imm:$src2)>;
	def : Pat<(or GR32:$src1, imm:$src2),
	          (OR32ri GR32:$src1, imm:$src2)>;
	def : Pat<(or GR16:$src1, i16immSExt8:$src2),
	          (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
	def : Pat<(or GR32:$src1, i32immSExt8:$src2),
	          (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
	def : Pat<(or GR64:$src1, i64immSExt8:$src2),
	          (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
	def : Pat<(or GR64:$src1, i64immSExt32:$src2),
	          (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;

	// xor: register ^ register
	def : Pat<(xor GR8:$src1, GR8:$src2),
	          (XOR8rr GR8:$src1, GR8:$src2)>;
	def : Pat<(xor GR16:$src1, GR16:$src2),
	          (XOR16rr GR16:$src1, GR16:$src2)>;
	def : Pat<(xor GR32:$src1, GR32:$src2),
	          (XOR32rr GR32:$src1, GR32:$src2)>;
	def : Pat<(xor GR64:$src1, GR64:$src2),
	          (XOR64rr GR64:$src1, GR64:$src2)>;

	// xor: register ^ loaded memory operand
	def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
	          (XOR8rm GR8:$src1, addr:$src2)>;
	def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
	          (XOR16rm GR16:$src1, addr:$src2)>;
	def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
	          (XOR32rm GR32:$src1, addr:$src2)>;
	def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
	          (XOR64rm GR64:$src1, addr:$src2)>;

	// xor: register ^ immediate
	def : Pat<(xor GR8:$src1, imm:$src2), (XOR8ri GR8:$src1, imm:$src2)>;
	def : Pat<(xor GR16:$src1, imm:$src2), (XOR16ri GR16:$src1, imm:$src2)>;
	def : Pat<(xor GR32:$src1, imm:$src2), (XOR32ri GR32:$src1, imm:$src2)>;
	def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
	          (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
	def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
	          (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
	def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
	          (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
	def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
	          (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;

	// and: register & register
	def : Pat<(and GR8:$src1, GR8:$src2),
	          (AND8rr GR8:$src1, GR8:$src2)>;
	def : Pat<(and GR16:$src1, GR16:$src2),
	          (AND16rr GR16:$src1, GR16:$src2)>;
	def : Pat<(and GR32:$src1, GR32:$src2),
	          (AND32rr GR32:$src1, GR32:$src2)>;
	def : Pat<(and GR64:$src1, GR64:$src2),
	          (AND64rr GR64:$src1, GR64:$src2)>;

	// and: register & loaded memory operand
	def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
	          (AND8rm GR8:$src1, addr:$src2)>;
	def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
	          (AND16rm GR16:$src1, addr:$src2)>;
	def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
	          (AND32rm GR32:$src1, addr:$src2)>;
	def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
	          (AND64rm GR64:$src1, addr:$src2)>;

	// and: register & immediate
	def : Pat<(and GR8:$src1, imm:$src2), (AND8ri GR8:$src1, imm:$src2)>;
	def : Pat<(and GR16:$src1, imm:$src2), (AND16ri GR16:$src1, imm:$src2)>;
	def : Pat<(and GR32:$src1, imm:$src2), (AND32ri GR32:$src1, imm:$src2)>;
	def : Pat<(and GR16:$src1, i16immSExt8:$src2),
	          (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
	def : Pat<(and GR32:$src1, i32immSExt8:$src2),
	          (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
	def : Pat<(and GR64:$src1, i64immSExt8:$src2),
	          (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
	def : Pat<(and GR64:$src1, i64immSExt32:$src2),
	          (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;

	// cttz with explicit zero-undef behavior selects the BSF bit-scan
	// instructions, for register and for loaded-memory sources.
	def : Pat<(cttz_zero_undef GR16:$src),
	          (BSF16rr GR16:$src)>;
	def : Pat<(cttz_zero_undef GR32:$src),
	          (BSF32rr GR32:$src)>;
	def : Pat<(cttz_zero_undef GR64:$src),
	          (BSF64rr GR64:$src)>;
	def : Pat<(cttz_zero_undef (loadi16 addr:$src)),
	          (BSF16rm addr:$src)>;
	def : Pat<(cttz_zero_undef (loadi32 addr:$src)),
	          (BSF32rm addr:$src)>;
	def : Pat<(cttz_zero_undef (loadi64 addr:$src)),
	          (BSF64rm addr:$src)>;

	// With HasMOVBE a register-register 16-bit bswap can reach selection without
	// having been legalized; select it as a rotate-left by 8.
	let Predicates = [HasMOVBE] in
	  def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
	Index: head/contrib/llvm/lib/Target/X86/X86InstrControl.td
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86InstrControl.td (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86InstrControl.td (revision 328817)
	@@ -1,358 +1,381 @@
	//===-- X86InstrControl.td - Control Flow Instructions ------ tablegen --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the X86 jump, return, call, and related instructions.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Control Flow Instructions.
	//

	// Return instructions.
	//
	// The X86retflag return instructions are variadic because we may add ST0 and
	// ST1 arguments when returning values on the x87 stack.
	//
	// The alternation inside an asm-string "{a|b}" group is written with a bare
	// '|'; a backslash-escaped bar is not a valid TableGen string escape.
	let isTerminator = 1, isReturn = 1, isBarrier = 1,
	    hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
	  // ret (opcode 0xC3), one definition per operand size / mode.
	  def RETL   : I   <0xC3, RawFrm, (outs), (ins variable_ops),
	                    "ret{l}", [], IIC_RET>, OpSize32,
	               Requires<[Not64BitMode]>;
	  def RETQ   : I   <0xC3, RawFrm, (outs), (ins variable_ops),
	                    "ret{q}", [], IIC_RET>, OpSize32,
	               Requires<[In64BitMode]>;
	  def RETW   : I   <0xC3, RawFrm, (outs), (ins),
	                    "ret{w}",
	                    [], IIC_RET>, OpSize16;

	  // ret with a 16-bit immediate operand $amt (opcode 0xC2).
	  def RETIL  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
	                    "ret{l}\t$amt",
	                    [], IIC_RET_IMM>, OpSize32,
	               Requires<[Not64BitMode]>;
	  def RETIQ  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
	                    "ret{q}\t$amt",
	                    [], IIC_RET_IMM>, OpSize32,
	               Requires<[In64BitMode]>;
	  def RETIW  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
	                    "ret{w}\t$amt",
	                    [], IIC_RET_IMM>, OpSize16;

	  // lret forms (opcode 0xCB, and 0xCA with a 16-bit immediate $amt).
	  def LRETL  : I   <0xCB, RawFrm, (outs), (ins),
	                    "{l}ret{l|f}", [], IIC_RET>, OpSize32;
	  def LRETQ  : RI  <0xCB, RawFrm, (outs), (ins),
	                    "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
	  def LRETW  : I   <0xCB, RawFrm, (outs), (ins),
	                    "{l}ret{w|f}", [], IIC_RET>, OpSize16;
	  def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
	                    "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
	  def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
	                    "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
	  def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
	                    "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;

	  // The machine return from interrupt instruction, but sometimes we need to
	  // perform a post-epilogue stack adjustment. Codegen emits the pseudo form
	  // which expands to include an SP adjustment if necessary.
	  def IRET16 : I   <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
	               OpSize16;
	  def IRET32 : I   <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
	                    IIC_IRET>, OpSize32;
	  def IRET64 : RI  <0xcf, RawFrm, (outs), (ins), "iretq", [],
	                    IIC_IRET>, Requires<[In64BitMode]>;

	  // isCodeGenOnly applies to the IRET pseudo only; RET below is outside it.
	  let isCodeGenOnly = 1 in
	    def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
	  def RET  : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
	}

	// Unconditional direct branches. JMP_1 carries the selection pattern for
	// `br`; the 16- and 32-bit displacement forms have no pattern and are marked
	// isCodeGenOnly / ForceDisassemble.
	let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
	  def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
	                       "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;

	  let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
	    def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
	                          "jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
	    def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
	                          "jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
	  }
	}

	// Conditional branches. ICBr defines three records per condition: `_1` with
	// an 8-bit target and the X86brcond pattern (opcode opc1), plus pattern-less
	// `_2` and `_4` with 16- and 32-bit targets (opcode opc4, TB).
	let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
	  multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
	    def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
	                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;

	    let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
	      def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
	                         [], IIC_Jcc>, OpSize16, TB;
	      def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
	                         [], IIC_Jcc>, TB, OpSize32;
	    }
	  }
	}

	// One instantiation per condition code: <opc1, opc4, asm string, condition>.
	defm JO  : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
	defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
	defm JB  : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
	defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
	defm JE  : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
	defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
	defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
	defm JA  : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
	defm JS  : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
	defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
	defm JP  : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
	defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
	defm JL  : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
	defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
	defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
	defm JG  : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;

	// jcxz / jecxz / jrcxz: all share opcode 0xE3 and differ in address size and
	// in which count register they read.
	let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
	  // These are the 32-bit versions of this instruction for the asmparser. In
	  // 32-bit mode, the address size prefix is jcxz and the unprefixed version
	  // is jecxz.
	  let Uses = [CX] in {
	    def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
	                        "jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
	               Requires<[Not64BitMode]>;
	  }
	  let Uses = [ECX] in {
	    def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
	                         "jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
	  }

	  // 64-bit address size form, reading RCX.
	  let Uses = [RCX] in {
	    def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
	                         "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
	                Requires<[In64BitMode]>;
	  }
	}

	// Indirect branches: `brind` through a register or a loaded memory operand,
	// followed by the pattern-less far-jump (ljmp) encodings.
	let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
	  // 16-bit operand size, outside 64-bit mode.
	  def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
	                 [(brind GR16:$dst)], IIC_JMP_REG>,
	               Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJump]>;
	  def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
	                 [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
	               Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;

	  // 32-bit operand size, outside 64-bit mode.
	  def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
	                 [(brind GR32:$dst)], IIC_JMP_REG>,
	               Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJump]>;
	  def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
	                 [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
	               Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;

	  // 64-bit mode.
	  def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
	                 [(brind GR64:$dst)], IIC_JMP_REG>,
	               Requires<[In64BitMode]>, Sched<[WriteJump]>;
	  def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
	                 [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
	               Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;

	  // ljmp with immediate segment:offset operands, outside 64-bit mode.
	  let Predicates = [Not64BitMode] in {
	    def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
	                           (ins i16imm:$off, i16imm:$seg),
	                           "ljmp{w}\t$seg, $off", [],
	                           IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
	    def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
	                           (ins i32imm:$off, i16imm:$seg),
	                           "ljmp{l}\t$seg, $off", [],
	                           IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
	  }

	  // ljmp through an opaque memory operand.
	  def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
	                    "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
	                 Sched<[WriteJump]>;

	  def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
	                    "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
	                  Sched<[WriteJumpLd]>;
	  def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
	                    "{l}jmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
	                  Sched<[WriteJumpLd]>;
	}


	// loop / loope / loopne: 8-bit pc-relative target, no selection patterns.
	let SchedRW = [WriteJump] in {
	  def LOOP   : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst),
	                        "loop\t$dst", [], IIC_LOOP>;
	  def LOOPE  : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst),
	                        "loope\t$dst", [], IIC_LOOPE>;
	  def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst),
	                        "loopne\t$dst", [], IIC_LOOPNE>;
	}

	//===----------------------------------------------------------------------===//
	// Call Instructions...
	//
	// Call instructions that use ESP/SSP: pc-relative, register-indirect and
	// memory-indirect calls, plus the far-call (lcall) encodings. Only the
	// register/memory forms carry an X86call selection pattern.
	let isCall = 1 in
	// All calls clobber the non-callee saved registers. ESP is marked as
	// a use to prevent stack-pointer assignments that appear immediately
	// before calls from potentially appearing dead. Uses for argument
	// registers are added manually.
	let Uses = [ESP, SSP] in {
	def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
	(outs), (ins i32imm_pcrel:$dst),
	"call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
	Requires<[Not64BitMode]>, Sched<[WriteJump]>;
	let hasSideEffects = 0 in
	def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
	(outs), (ins i16imm_pcrel:$dst),
	"call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
	Sched<[WriteJump]>;
	def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
	"call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
	OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
	def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
	"call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
	IIC_CALL_MEM>, OpSize16,
	Requires<[Not64BitMode,FavorMemIndirectCall]>,
	Sched<[WriteJumpLd]>;
	// The -/+ lines below show this revision adding NotUseRetpoline to the
	// predicates of the 32-bit indirect calls CALL32r and CALL32m.
	def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
	"call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
	- OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
	+ OpSize32, Requires<[Not64BitMode,NotUseRetpoline]>,
	+ Sched<[WriteJump]>;
	def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
	"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
	IIC_CALL_MEM>, OpSize32,
	- Requires<[Not64BitMode,FavorMemIndirectCall]>,
	+ Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>,
	Sched<[WriteJumpLd]>;

	// lcall with immediate segment:offset operands, outside 64-bit mode.
	let Predicates = [Not64BitMode] in {
	def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
	(ins i16imm:$off, i16imm:$seg),
	"lcall{w}\t$seg, $off", [],
	IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
	def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
	(ins i32imm:$off, i16imm:$seg),
	"lcall{l}\t$seg, $off", [],
	IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
	}

	// lcall through an opaque memory operand.
	def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
	"lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
	Sched<[WriteJumpLd]>;
	def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
	"{l}call{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
	Sched<[WriteJumpLd]>;
	}


	// Tail calls using ESP/SSP: the TCRETURN* pseudos (direct, register and
	// memory targets) and the TAILJMP* instructions.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
	    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
	let Uses = [ESP, SSP] in {
	  def TCRETURNdi : PseudoI<(outs),
	                           (ins i32imm_pcrel:$dst, i32imm:$offset), []>,
	                   NotMemoryFoldable;
	  def TCRETURNri : PseudoI<(outs),
	                           (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>,
	                   NotMemoryFoldable;
	  let mayLoad = 1 in
	    def TCRETURNmi : PseudoI<(outs),
	                             (ins i32mem_TC:$dst, i32imm:$offset), []>;

	  // FIXME: These should be pseudo instructions that are lowered when going
	  // to mcinst.
	  def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
	                           (ins i32imm_pcrel:$dst),
	                           "jmp\t$dst",
	                           [], IIC_JMP_REL>;

	  // FIXME: Remove encoding when JIT is dead.
	  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
	                   "", [], IIC_JMP_REG>;
	  let mayLoad = 1 in
	    def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
	                     "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
	}

	// Conditional tail calls. Unlike the unconditional tail calls they are marked
	// isBranch instead of isBarrier, and they additionally use EFLAGS.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
	    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
	let Uses = [ESP, EFLAGS, SSP] in {
	  def TCRETURNdicc : PseudoI<(outs),
	                             (ins i32imm_pcrel:$dst, i32imm:$offset,
	                                  i32imm:$cond),
	                             []>;

	  // Replaced by a conditional jump instruction during MC lowering.
	  def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
	                              (ins i32imm_pcrel:$dst, i32imm:$cond),
	                              "",
	                              [], IIC_JMP_REL>;
	}


	//===----------------------------------------------------------------------===//
	// Call Instructions...
	//

	// Call instructions that use RSP/SSP: pc-relative, register-indirect,
	// memory-indirect and far (lcall) forms.
	//
	// RSP is marked as a use to prevent stack-pointer assignments that appear
	// immediately before calls from potentially appearing dead. Uses for argument
	// registers are added manually.
	let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
	// NOTE: this pattern doesn't match "X86call imm", because we do not know
	// that the offset between an arbitrary immediate and the call will fit in
	// the 32-bit pcrel field that we have.
	def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
	(outs), (ins i64i32imm_pcrel:$dst),
	"call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
	Requires<[In64BitMode]>;
	// The -/+ lines below show this revision adding NotUseRetpoline to the
	// predicates of the indirect calls CALL64r and CALL64m.
	def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
	"call{q}\t{*}$dst", [(X86call GR64:$dst)],
	IIC_CALL_RI>,
	- Requires<[In64BitMode]>;
	+ Requires<[In64BitMode,NotUseRetpoline]>;
	def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
	"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
	IIC_CALL_MEM>,
	- Requires<[In64BitMode,FavorMemIndirectCall]>;
	+ Requires<[In64BitMode,FavorMemIndirectCall,
	+ NotUseRetpoline]>;

	// lcall through an opaque 80-bit memory operand; no selection pattern.
	def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
	"lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
	}

	// Tail-call pseudos (TCRETURN*64) and TAILJMP*64 instructions using RSP/SSP,
	// all marked usesCustomInserter. In this revision the '+' lines close this
	// group right after the REX_W variants and open a new group of RETPOLINE_*
	// pseudos; the two context closing braces at the end now terminate that new
	// group instead.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
	isCodeGenOnly = 1, Uses = [RSP, SSP], usesCustomInserter = 1,
	SchedRW = [WriteJump] in {
	def TCRETURNdi64 : PseudoI<(outs),
	(ins i64i32imm_pcrel:$dst, i32imm:$offset),
	[]>;
	def TCRETURNri64 : PseudoI<(outs),
	(ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable;
	let mayLoad = 1 in
	def TCRETURNmi64 : PseudoI<(outs),
	(ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable;

	def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
	"jmp\t$dst", [], IIC_JMP_REL>;

	def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
	"jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;

	let mayLoad = 1 in
	def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
	"jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;

	// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
	let hasREX_WPrefix = 1 in {
	def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
	"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;

	let mayLoad = 1 in
	def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
	"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
	+ }
	+}
	+
	+let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
	+ Uses = [RSP, SSP],
	+ usesCustomInserter = 1,
	+ SchedRW = [WriteJump] in {
	+ def RETPOLINE_CALL32 :
	+ PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>,
	+ Requires<[Not64BitMode,UseRetpoline]>;
	+
	+ def RETPOLINE_CALL64 :
	+ PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>,
	+ Requires<[In64BitMode,UseRetpoline]>;
	+
	+ // Retpoline variant of indirect tail calls.
	+ let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
	+ def RETPOLINE_TCRETURN64 :
	+ PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>;
	+ def RETPOLINE_TCRETURN32 :
	+ PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>;
	}
	}

	// Conditional tail calls using RSP. Unlike the unconditional tail calls they
	// are marked isBranch instead of isBarrier, and they additionally use EFLAGS.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
	    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
	let Uses = [RSP, EFLAGS, SSP] in {
	  def TCRETURNdi64cc : PseudoI<(outs),
	                               (ins i64i32imm_pcrel:$dst, i32imm:$offset,
	                                    i32imm:$cond),
	                               []>;

	  // Replaced by a conditional jump instruction during MC lowering.
	  def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
	                                (ins i64i32imm_pcrel:$dst, i32imm:$cond),
	                                "",
	                                [], IIC_JMP_REL>;
	}
	Index: head/contrib/llvm/lib/Target/X86/X86InstrInfo.td
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86InstrInfo.td (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86InstrInfo.td (revision 328817)
	@@ -1,3348 +1,3350 @@
	//===-- X86InstrInfo.td - Main X86 Instruction Definition --- tablegen --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the X86 instruction set, defining the instructions, and
	// properties of the instructions which are needed for code generation, machine
	// code emission, and analysis.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// X86 specific DAG Nodes.
	//

	// One result, three operands: values 0, 1 and 2 share a type, and values 0
	// and 3 are integers. Used by the X86shld / X86shrd nodes.
	def SDTIntShiftDOp : SDTypeProfile<1, 3, [
	  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
	  SDTCisInt<0>, SDTCisInt<3>
	]>;

	// One i32 result, two operands of the same type.
	def SDTX86CmpTest : SDTypeProfile<1, 2, [
	  SDTCisVT<0, i32>, SDTCisSameAs<1, 2>
	]>;

	// One floating-point result, two same-typed operands and an i8 operand.
	def SDTX86Cmps : SDTypeProfile<1, 3, [
	  SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>
	]>;
	//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;

	// One result, four operands: values 0, 1 and 2 share a type, value 3 is i8
	// and value 4 is i32.
	def SDTX86Cmov : SDTypeProfile<1, 4, [
	  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
	  SDTCisVT<3, i8>, SDTCisVT<4, i32>
	]>;

	// Type profiles for unary and binary operators that also set EFLAGS as a
	// side-effect; in each, value 1 is constrained to i32.
	def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, [
	  SDTCisSameAs<0, 2>,
	  SDTCisInt<0>, SDTCisVT<1, i32>
	]>;

	def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [
	  SDTCisSameAs<0, 2>,
	  SDTCisSameAs<0, 3>,
	  SDTCisInt<0>, SDTCisVT<1, i32>
	]>;

	// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
	def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [
	  SDTCisSameAs<0, 2>,
	  SDTCisSameAs<0, 3>,
	  SDTCisInt<0>,
	  SDTCisVT<1, i32>,
	  SDTCisVT<4, i32>
	]>;

	// RES1, RES2, FLAGS = op LHS, RHS
	def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, [
	  SDTCisSameAs<0, 1>,
	  SDTCisSameAs<0, 2>,
	  SDTCisSameAs<0, 3>,
	  SDTCisInt<0>, SDTCisVT<1, i32>
	]>;

	// No results, three operands: OtherVT, i8 and i32.
	def SDTX86BrCond : SDTypeProfile<0, 3, [
	  SDTCisVT<0, OtherVT>,
	  SDTCisVT<1, i8>, SDTCisVT<2, i32>
	]>;

	// setcc: i8 result from an i8 operand and an i32 operand.
	def SDTX86SetCC : SDTypeProfile<1, 2, [
	  SDTCisVT<0, i8>,
	  SDTCisVT<1, i8>, SDTCisVT<2, i32>
	]>;
	// Same operands as SDTX86SetCC, but the result may be any integer type.
	def SDTX86SetCC_C : SDTypeProfile<1, 2, [
	  SDTCisInt<0>,
	  SDTCisVT<1, i8>, SDTCisVT<2, i32>
	]>;

	// i32 result from an i8 operand.
	def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;

	// Two results (an integer and an i32), no operands.
	def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;

	// Compare-and-swap profiles: each takes a pointer operand; the SaveEbx8 and
	// SaveRbx16 variants also produce an i32 / i64 result.
	def SDTX86cas : SDTypeProfile<0, 3, [
	  SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>
	]>;
	def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
	def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3, [
	  SDTCisVT<0, i32>, SDTCisPtrTy<1>,
	  SDTCisVT<2, i32>, SDTCisVT<3, i32>
	]>;
	def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3, [
	  SDTCisVT<0, i64>, SDTCisPtrTy<1>,
	  SDTCisVT<2, i64>, SDTCisVT<3, i64>
	]>;

	// Locked arithmetic: i32 result and a pointer operand; the binary form also
	// takes an integer operand.
	def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [
	  SDTCisVT<0, i32>,
	  SDTCisPtrTy<1>,
	  SDTCisInt<2>
	]>;

	def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [
	  SDTCisVT<0, i32>,
	  SDTCisPtrTy<1>
	]>;

	// Return: operand count given as -1; value 0 is i32.
	def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;

	// Call-sequence start / end markers, each constraining two i32 values.
	def SDT_X86CallSeqStart : SDCallSeqStart<[
	  SDTCisVT<0, i32>, SDTCisVT<1, i32>
	]>;
	def SDT_X86CallSeqEnd : SDCallSeqEnd<[
	  SDTCisVT<0, i32>, SDTCisVT<1, i32>
	]>;

	// Call: operand count given as -1; value 0 is iPTR.
	def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;

	def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [
	  SDTCisVT<0, i8>,
	  SDTCisVT<1, iPTR>,
	  SDTCisVT<2, iPTR>
	]>;

	def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [
	  SDTCisPtrTy<0>,
	  SDTCisPtrTy<1>,
	  SDTCisVT<2, i32>,
	  SDTCisVT<3, i8>,
	  SDTCisVT<4, i32>
	]>;

	def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;

	// No results, no operands.
	def SDTX86Void : SDTypeProfile<0, 0, []>;

	// Result and operand share a type, and that type is a pointer type.
	def SDTX86Wrapper : SDTypeProfile<1, 1, [
	  SDTCisSameAs<0, 1>, SDTCisPtrTy<0>
	]>;

	// TLS-related profiles: a single integer operand each.
	def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

	def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

	def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

	// Alloca profiles over iPTR values.
	def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;

	def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [
	  SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>
	]>;

	def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

	// Tail-call return: a pointer operand and an i32 operand.
	def SDT_X86TCRET : SDTypeProfile<0, 2, [
	  SDTCisPtrTy<0>, SDTCisVT<1, i32>
	]>;

	def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;

	// Barrier nodes. Both use the empty SDT_X86MEMBARRIER profile and carry a
	// chain; only X86MemBarrier is additionally marked SDNPSideEffect.
	def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
	[SDNPHasChain,SDNPSideEffect]>;
	def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
	[SDNPHasChain]>;


	// BSF/BSR use the unary arithmetic-with-flags profile; SHLD/SHRD use the
	// generic SDTIntShiftDOp profile. None of them carries node properties.
	def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
	def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
	def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
	def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;

	// CMP and BT share the SDTX86CmpTest profile.
	def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
	def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;

	// Flag consumers: conditional move, conditional branch (chained) and the two
	// SETCC forms.
	def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
	def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
	[SDNPHasChain]>;
	def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
	def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;

	def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;

	// RDRAND and RDSEED share one type profile; both are chained and marked as
	// having side effects.
	def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
	[SDNPHasChain, SDNPSideEffect]>;

	def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
	[SDNPHasChain, SDNPSideEffect]>;

	// Locked compare-and-exchange nodes. All five are chained, take and produce
	// glue, may load and store, and carry a memory operand.
	def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
	[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
	SDNPMayLoad, SDNPMemOperand]>;
	// The 8- and 16-byte forms share the pointer-only SDTX86caspair profile.
	def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
	[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
	SDNPMayLoad, SDNPMemOperand]>;
	def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
	[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
	SDNPMayLoad, SDNPMemOperand]>;
	// SAVE_EBX / SAVE_RBX variants use the profiles with one result and two extra
	// value operands.
	def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG",
	SDTX86caspairSaveEbx8,
	[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
	SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
	def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG",
	SDTX86caspairSaveRbx16,
	[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
	SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;

	// Return nodes. Both use SDTX86Ret with a chain and optional input glue; only
	// RET_FLAG is marked variadic.
	def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
	def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret,
	[SDNPHasChain, SDNPOptInGlue]>;

	// Varargs support nodes.
	def X86vastart_save_xmm_regs :
	SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
	SDT_X86VASTART_SAVE_XMM_REGS,
	[SDNPHasChain, SDNPVariadic]>;
	def X86vaarg64 :
	SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
	[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
	SDNPMemOperand]>;
	// Call-sequence markers map onto the target-independent ISD opcodes. The
	// start marker produces glue; the end marker optionally consumes glue and
	// produces it.
	def X86callseq_start :
	SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
	[SDNPHasChain, SDNPOutGlue]>;
	def X86callseq_end :
	SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	// Call: chained, variadic, optional input glue, produces output glue.
	def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
	[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
	SDNPVariadic]>;

	// REP string nodes: chained and glued in both directions. REP_STOS may only
	// store, while REP_MOVS may both load and store.
	def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
	[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
	def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
	[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
	SDNPMayLoad]>;

	// Counter-read nodes: no typed results or operands (SDTX86Void); chained,
	// produce glue, and are marked as having side effects.
	def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
	[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
	def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
	[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
	def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
	[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;

	// Address wrapper nodes; both share the SDTX86Wrapper profile.
	def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
	def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;

	// Maps the target-independent ISD::LOCAL_RECOVER opcode with an inline
	// profile: one integer operand and a result of the same type.
	def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER",
	SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
	SDTCisInt<1>]>>;

	// TLS address nodes: chained, optional input glue, produce output glue.
	def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	// Exception-handling return; chained only.
	def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
	[SDNPHasChain]>;

	// SjLj exception-handling nodes with inline type profiles. All three are
	// chained and marked as having side effects.
	// setjmp: integer result, pointer operand.
	def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP",
	SDTypeProfile<1, 1, [SDTCisInt<0>,
	SDTCisPtrTy<1>]>,
	[SDNPHasChain, SDNPSideEffect]>;
	// longjmp: no result, pointer operand.
	def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
	SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
	[SDNPHasChain, SDNPSideEffect]>;
	// setup_dispatch: no results, no operands.
	def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH",
	SDTypeProfile<0, 0, []>,
	[SDNPHasChain, SDNPSideEffect]>;

	// Tail-call return: chained, variadic, optional input glue.
	def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

	// Arithmetic nodes that use the *WithFlags type profiles. ADD, SMUL, UMUL,
	// OR, XOR and AND are marked commutative; SUB, ADC, SBB, INC and DEC are not.
	def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags,
	[SDNPCommutative]>;
	def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
	def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
	[SDNPCommutative]>;
	// UMUL is the only node using the three-result profile.
	def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
	[SDNPCommutative]>;
	// ADC/SBB take EFLAGS as an input as well as producing it.
	def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>;
	def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>;

	def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>;
	def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>;
	def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags,
	[SDNPCommutative]>;
	def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
	[SDNPCommutative]>;
	def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
	[SDNPCommutative]>;

	// Locked read-modify-write nodes. Every node is chained, may load and store,
	// and carries a memory operand. The binary forms take a pointer plus an
	// integer value (SDTLockBinaryArithWithFlags).
	def X86lock_add : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
	SDNPMemOperand]>;
	def X86lock_sub : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
	SDNPMemOperand]>;
	def X86lock_or : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
	SDNPMemOperand]>;
	def X86lock_xor : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
	SDNPMemOperand]>;
	def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
	SDNPMemOperand]>;

	// The unary forms take only the pointer (SDTLockUnaryArithWithFlags).
	def X86lock_inc : SDNode<"X86ISD::LINC", SDTLockUnaryArithWithFlags,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
	SDNPMemOperand]>;
	def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
	SDNPMemOperand]>;

	// MUL_IMM uses the generic integer binary-op profile and has no node
	// properties.
	def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;

	// Alloca nodes: both are chained; only WIN_ALLOCA produces glue.
	def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
	[SDNPHasChain, SDNPOutGlue]>;

	def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
	[SDNPHasChain]>;

	// TLS call: chained, optional input glue, produces output glue.
	def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	// LWPINS with an inline profile: i32 result; operands are an integer followed
	// by two i32 values. Chained, may load and store, has side effects.
	def X86lwpins : SDNode<"X86ISD::LWPINS",
	SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
	SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>;

	//===----------------------------------------------------------------------===//
	// X86 Operand Definitions.
	//

	// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
	// the index operand of an address, to conform to x86 encoding restrictions.
	def ptr_rc_nosp : PointerLikeRegClass<1>;

	// *mem - Operand definitions for the funky X86 addressing mode operands.
	//
	// X86MemAsmOperand is the root asm-parser class for memory operands; every
	// class in the block below lists it as its superclass.
	def X86MemAsmOperand : AsmOperandClass {
	let Name = "Mem";
	}
	// Size-specific memory classes. All of them render through addMemOperands and
	// refine X86MemAsmOperand; the number in each name matches the number in its
	// "Mem<N>" asm-matcher Name.
	let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
	def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; }
	def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; }
	def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; }
	def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; }
	def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; }
	def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; }
	def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; }
	def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; }
	// Gather mem operands
	// Names follow Mem<size>_RC<width>: the RC128/RC256 classes here are paired
	// with VR128/VR256 index registers by the vx*/vy* operand definitions.
	def X86Mem64_RC128Operand : AsmOperandClass { let Name = "Mem64_RC128"; }
	def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; }
	def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; }
	def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; }
	def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; }

	// The "X"-suffixed and RC512 classes are paired with VR128X/VR256X/VR512
	// index registers by the corresponding operand definitions.
	def X86Mem64_RC128XOperand : AsmOperandClass { let Name = "Mem64_RC128X"; }
	def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; }
	def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; }
	def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; }
	def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; }
	def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
	def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; }
	def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
	}

	// Absolute-memory class; it is the parser match class of the pc-relative
	// operands (i32imm_pcrel, brtarget, ...) defined in this file.
	def X86AbsMemAsmOperand : AsmOperandClass {
	let Name = "AbsMem";
	let SuperClasses = [X86MemAsmOperand];
	}

	// Base class for iPTR memory operands.
	//   printMethod      - name stored in PrintMethod.
	//   parserMatchClass - asm-parser class; defaults to the generic
	//                      X86MemAsmOperand.
	// The operand expands to five machine sub-operands, in order: ptr_rc, i8imm,
	// ptr_rc_nosp, i32imm, SEGMENT_REG (presumably base, scale, index,
	// displacement and segment - confirm against the X86 addressing-mode
	// helpers).
	class X86MemOperand<string printMethod,
	AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
	let PrintMethod = printMethod;
	let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
	let ParserMatchClass = parserMatchClass;
	let OperandType = "OPERAND_MEMORY";
	}

	// Gather mem operands
	// Same as X86MemOperand except that the third sub-operand is taken from the
	// register class RC instead of ptr_rc_nosp.
	class X86VMemOperand<RegisterClass RC, string printMethod,
	AsmOperandClass parserMatchClass>
	: X86MemOperand<printMethod, parserMatchClass> {
	let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
	}

	// Memory operands matched by the generic X86MemAsmOperand class (no explicit
	// parser class is passed).
	def anymem : X86MemOperand<"printanymem">;

	def opaque32mem : X86MemOperand<"printopaquemem">;
	def opaque48mem : X86MemOperand<"printopaquemem">;
	def opaque80mem : X86MemOperand<"printopaquemem">;
	def opaque512mem : X86MemOperand<"printopaquemem">;

	// Sized memory operands. The i* and f* operands of one size share a parser
	// match class and differ only in their print method.
	def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>;
	def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>;
	def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>;
	def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>;
	def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>;
	def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>;
	def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>;
	def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>;
	def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>;
	def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>;
	def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>;
	def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>;
	def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>;

	// 512-bit memory operand whose third sub-operand is a VR512 register.
	def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>;

	// Gather mem operands
	// vx* / vy* take the third sub-operand from VR128 / VR256.
	def vx64mem : X86VMemOperand<VR128, "printi64mem", X86Mem64_RC128Operand>;
	def vx128mem : X86VMemOperand<VR128, "printi128mem", X86Mem128_RC128Operand>;
	def vx256mem : X86VMemOperand<VR128, "printi256mem", X86Mem256_RC128Operand>;
	def vy128mem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256Operand>;
	def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>;

	// Variants using the VR128X / VR256X / VR512 register classes.
	def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>;
	def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
	def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
	def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>;
	def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
	def vy512mem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
	def vz256xmem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
	def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;

	// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
	// of a plain GPR, so that it doesn't potentially require a REX prefix.
	def ptr_rc_norex : PointerLikeRegClass<2>;
	def ptr_rc_norex_nosp : PointerLikeRegClass<3>;

	// Same printer and parser class as i8mem, but the first and third
	// sub-operands come from the NOREX pointer register classes.
	def i8mem_NOREX : Operand<iPTR> {
	let PrintMethod = "printi8mem";
	let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
	SEGMENT_REG);
	let ParserMatchClass = X86Mem8AsmOperand;
	let OperandType = "OPERAND_MEMORY";
	}

	// GPRs available for tailcall.
	// It represents GR32_TC, GR64_TC or GR64_TCW64.
	def ptr_rc_tailcall : PointerLikeRegClass<4>;

	// Special i32mem for addresses of load folding tail calls. These are not
	// allowed to use callee-saved registers since they must be scheduled
	// after callee-saved registers are popped.
	def i32mem_TC : Operand<i32> {
	let PrintMethod = "printi32mem";
	let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
	i32imm, SEGMENT_REG);
	let ParserMatchClass = X86Mem32AsmOperand;
	let OperandType = "OPERAND_MEMORY";
	}

	// Special i64mem for addresses of load folding tail calls. These are not
	// allowed to use callee-saved registers since they must be scheduled
	// after callee-saved registers are popped.
	def i64mem_TC : Operand<i64> {
	let PrintMethod = "printi64mem";
	let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
	ptr_rc_tailcall, i32imm, SEGMENT_REG);
	let ParserMatchClass = X86Mem64AsmOperand;
	let OperandType = "OPERAND_MEMORY";
	}

	// PC-relative operands. Everything in this block is printed by printPCRelImm
	// and parsed through X86AbsMemAsmOperand.
	let OperandType = "OPERAND_PCREL",
	ParserMatchClass = X86AbsMemAsmOperand,
	PrintMethod = "printPCRelImm" in {
	def i32imm_pcrel : Operand<i32>;
	def i16imm_pcrel : Operand<i16>;

	// Branch targets have OtherVT type and print as pc-relative values.
	def brtarget : Operand<OtherVT>;
	def brtarget8 : Operand<OtherVT>;

	}

	// Special parser to detect 16-bit mode to select 16-bit displacement.
	def X86AbsMem16AsmOperand : AsmOperandClass {
	let Name = "AbsMem16";
	let RenderMethod = "addAbsMemOperands";
	let SuperClasses = [X86AbsMemAsmOperand];
	}

	// Branch targets have OtherVT type and print as pc-relative values.
	// brtarget16 and brtarget32 differ only in their parser match class:
	// brtarget16 uses the 16-bit-aware X86AbsMem16AsmOperand.
	let OperandType = "OPERAND_PCREL",
	PrintMethod = "printPCRelImm" in {
	let ParserMatchClass = X86AbsMem16AsmOperand in
	def brtarget16 : Operand<OtherVT>;
	let ParserMatchClass = X86AbsMemAsmOperand in
	def brtarget32 : Operand<OtherVT>;
	}

	// Asm operand classes for the "SrcIdx<N>" operands. Each one renders through
	// addSrcIdxOperands and refines the plain memory class of the same width.
	class X86SrcIdxAsmOperandClass<string name, AsmOperandClass memClass>
	: AsmOperandClass {
	let Name = name;
	let RenderMethod = "addSrcIdxOperands";
	let SuperClasses = [memClass];
	}
	def X86SrcIdx8Operand
	: X86SrcIdxAsmOperandClass<"SrcIdx8", X86Mem8AsmOperand>;
	def X86SrcIdx16Operand
	: X86SrcIdxAsmOperandClass<"SrcIdx16", X86Mem16AsmOperand>;
	def X86SrcIdx32Operand
	: X86SrcIdxAsmOperandClass<"SrcIdx32", X86Mem32AsmOperand>;
	def X86SrcIdx64Operand
	: X86SrcIdxAsmOperandClass<"SrcIdx64", X86Mem64AsmOperand>;

	// Asm operand classes for the "DstIdx<N>" operands. Each one renders through
	// addDstIdxOperands and refines the plain memory class of the same width.
	class X86DstIdxAsmOperandClass<string name, AsmOperandClass memClass>
	: AsmOperandClass {
	let Name = name;
	let RenderMethod = "addDstIdxOperands";
	let SuperClasses = [memClass];
	}
	def X86DstIdx8Operand
	: X86DstIdxAsmOperandClass<"DstIdx8", X86Mem8AsmOperand>;
	def X86DstIdx16Operand
	: X86DstIdxAsmOperandClass<"DstIdx16", X86Mem16AsmOperand>;
	def X86DstIdx32Operand
	: X86DstIdxAsmOperandClass<"DstIdx32", X86Mem32AsmOperand>;
	def X86DstIdx64Operand
	: X86DstIdxAsmOperandClass<"DstIdx64", X86Mem64AsmOperand>;

	// Asm operand classes for the "MemOffs<A>_<S>" operands. Each one renders
	// through addMemOffsOperands and refines the plain memory class whose width
	// equals the second number, <S>, in its name.
	class X86MemOffsAsmOperandClass<string name, AsmOperandClass memClass>
	: AsmOperandClass {
	let Name = name;
	let RenderMethod = "addMemOffsOperands";
	let SuperClasses = [memClass];
	}
	def X86MemOffs16_8AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs16_8", X86Mem8AsmOperand>;
	def X86MemOffs16_16AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs16_16", X86Mem16AsmOperand>;
	def X86MemOffs16_32AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs16_32", X86Mem32AsmOperand>;
	def X86MemOffs32_8AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs32_8", X86Mem8AsmOperand>;
	def X86MemOffs32_16AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs32_16", X86Mem16AsmOperand>;
	def X86MemOffs32_32AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs32_32", X86Mem32AsmOperand>;
	def X86MemOffs32_64AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs32_64", X86Mem64AsmOperand>;
	def X86MemOffs64_8AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs64_8", X86Mem8AsmOperand>;
	def X86MemOffs64_16AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs64_16", X86Mem16AsmOperand>;
	def X86MemOffs64_32AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs64_32", X86Mem32AsmOperand>;
	def X86MemOffs64_64AsmOperand
	: X86MemOffsAsmOperandClass<"MemOffs64_64", X86Mem64AsmOperand>;

	// Memory operand reduced to two machine sub-operands: a pointer register and
	// a segment register. Everything else is inherited from X86MemOperand.
	class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
	: X86MemOperand<printMethod, parserMatchClass> {
	let MIOperandInfo = (ops ptr_rc, SEGMENT_REG);
	}

	// Memory operand reduced to a single pointer-register sub-operand (no
	// segment register, unlike X86SrcIdxOperand).
	class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
	: X86MemOperand<printMethod, parserMatchClass> {
	let MIOperandInfo = (ops ptr_rc);
	}

	// One source-index and one destination-index operand per width, each with
	// its own print method and the asm operand class of the same width.
	def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>;
	def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>;
	def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>;
	def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>;
	def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>;
	def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>;
	def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>;
	def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>;

	// Memory operand made of an immediate sub-operand (immOperand) followed by a
	// segment register. Everything else is inherited from X86MemOperand.
	class X86MemOffsOperand<Operand immOperand, string printMethod,
	AsmOperandClass parserMatchClass>
	: X86MemOperand<printMethod, parserMatchClass> {
	let MIOperandInfo = (ops immOperand, SEGMENT_REG);
	}

	// offset<A>_<S>: <A> selects the immediate sub-operand type (i16imm, i32imm
	// or i64imm) and <S> selects the printMemOffs<S> print method.
	def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8",
	X86MemOffs16_8AsmOperand>;
	def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16",
	X86MemOffs16_16AsmOperand>;
	def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32",
	X86MemOffs16_32AsmOperand>;
	def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8",
	X86MemOffs32_8AsmOperand>;
	def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16",
	X86MemOffs32_16AsmOperand>;
	def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32",
	X86MemOffs32_32AsmOperand>;
	def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64",
	X86MemOffs32_64AsmOperand>;
	def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8",
	X86MemOffs64_8AsmOperand>;
	def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16",
	X86MemOffs64_16AsmOperand>;
	def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
	X86MemOffs64_32AsmOperand>;
	def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
	X86MemOffs64_64AsmOperand>;

	// Condition-code immediates: i8 operands of type OPERAND_IMMEDIATE. SSECC,
	// AVXCC and AVX512ICC share the printSSEAVXCC printer; XOPCC has its own.
	let OperandType = "OPERAND_IMMEDIATE" in {
	let PrintMethod = "printSSEAVXCC" in {
	def SSECC : Operand<i8>;
	def AVXCC : Operand<i8>;
	def AVX512ICC : Operand<i8>;
	}
	let PrintMethod = "printXOPCC" in
	def XOPCC : Operand<i8>;
	}

	// Common base for the sign-extended immediate asm operand classes: refines
	// ImmAsmOperand and renders through addImmOperands.
	class ImmSExtAsmOperandClass : AsmOperandClass {
	let SuperClasses = [ImmAsmOperand];
	let RenderMethod = "addImmOperands";
	}

	// Asm operand class named "GR32orGR64" and the register operand using it.
	// The operand itself is declared over the GR32 register class.
	def X86GR32orGR64AsmOperand : AsmOperandClass {
	let Name = "GR32orGR64";
	}

	def GR32orGR64 : RegisterOperand<GR32> {
	let ParserMatchClass = X86GR32orGR64AsmOperand;
	}
	// Asm operand class and i32 immediate operand printed by
	// printRoundingControl.
	def AVX512RCOperand : AsmOperandClass {
	let Name = "AVX512RC";
	}
	def AVX512RC : Operand<i32> {
	let PrintMethod = "printRoundingControl";
	let OperandType = "OPERAND_IMMEDIATE";
	let ParserMatchClass = AVX512RCOperand;
	}

	// Sign-extended immediate classes. We don't need to define the full lattice
	// here because there is no instruction with an ambiguity between ImmSExti64i32
	// and ImmSExti32i8.
	//
	// The strange ranges come from the fact that the assembler always works with
	// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
	// (which will be a -1ULL), and "0xFFFF" (-1 in 16 bits).

	// Accepted range for each class is given as a union of closed intervals.

	// [0, 0x7FFFFFFF] |
	// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
	def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
	let Name = "ImmSExti64i32";
	}

	// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
	// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
	// Replaces the inherited superclass list with ImmSExti64i32AsmOperand.
	def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
	let Name = "ImmSExti16i8";
	let SuperClasses = [ImmSExti64i32AsmOperand];
	}

	// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
	// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
	def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
	let Name = "ImmSExti32i8";
	}

	// [0, 0x0000007F] |
	// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
	// Lists all three other sign-extended classes as superclasses.
	def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
	let Name = "ImmSExti64i8";
	let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
	ImmSExti64i32AsmOperand];
	}

	// Unsigned immediate used by SSE/AVX instructions
	// [0, 0xFF]
	// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
	// Derives directly from AsmOperandClass (not ImmSExtAsmOperandClass), so it
	// sets no SuperClasses; it renders through addImmOperands.
	def ImmUnsignedi8AsmOperand : AsmOperandClass {
	let Name = "ImmUnsignedi8";
	let RenderMethod = "addImmOperands";
	}

	// A couple of more descriptive operand definitions.
	// Each operand below pairs a value type with the asm operand class that
	// restricts which immediates the parser accepts for it.
	// 16-bits but only 8 bits are significant.
	def i16i8imm : Operand<i16> {
	let ParserMatchClass = ImmSExti16i8AsmOperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}
	// 32-bits but only 8 bits are significant.
	def i32i8imm : Operand<i32> {
	let ParserMatchClass = ImmSExti32i8AsmOperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}

	// 64-bits but only 32 bits are significant.
	def i64i32imm : Operand<i64> {
	let ParserMatchClass = ImmSExti64i32AsmOperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}

	// 64-bits but only 8 bits are significant.
	def i64i8imm : Operand<i64> {
	let ParserMatchClass = ImmSExti64i8AsmOperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}

	// Unsigned 8-bit immediate used by SSE/AVX instructions.
	def u8imm : Operand<i8> {
	let PrintMethod = "printU8Imm";
	let ParserMatchClass = ImmUnsignedi8AsmOperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}

	// 32-bit immediate but only 8-bits are significant and they are unsigned.
	// Used by some SSE/AVX instructions that use intrinsics.
	// Shares the printer and parser class of u8imm; only the value type differs.
	def i32u8imm : Operand<i32> {
	let PrintMethod = "printU8Imm";
	let ParserMatchClass = ImmUnsignedi8AsmOperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}

	// 64-bits but only 32 bits are significant, and those bits are treated as being
	// pc relative.
	def i64i32imm_pcrel : Operand<i64> {
	let PrintMethod = "printPCRelImm";
	let ParserMatchClass = X86AbsMemAsmOperand;
	let OperandType = "OPERAND_PCREL";
	}

	// Memory operand with an i32 value type whose first and third sub-operands
	// are 64-bit registers (GR64 / GR64_NOSP). Unlike X86MemOperand, it does not
	// set OperandType.
	def lea64_32mem : Operand<i32> {
	let PrintMethod = "printanymem";
	let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
	let ParserMatchClass = X86MemAsmOperand;
	}

	// Memory operands that use 64-bit pointers in both ILP32 and LP64.
	def lea64mem : Operand<i64> {
	let PrintMethod = "printanymem";
	let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
	let ParserMatchClass = X86MemAsmOperand;
	}


	//===----------------------------------------------------------------------===//
	// X86 Complex Pattern Definitions.
	//

	// Define X86-specific addressing mode.
	// Every address pattern below yields five operands, matching the five
	// sub-operands of the memory operand definitions.
	def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>;
	// LEA address for i32 results; selectLEAAddr is tried on the listed root
	// nodes.
	def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
	[add, sub, mul, X86mul_imm, shl, or, frameindex],
	[]>;
	// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
	def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr",
	[add, sub, mul, X86mul_imm, shl, or,
	frameindex, X86WrapperRIP],
	[]>;

	// TLS address patterns. The i32 and i64 variants all use selectTLSADDRAddr
	// rooted at tglobaltlsaddr.
	def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
	[tglobaltlsaddr], []>;

	def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
	[tglobaltlsaddr], []>;

	// LEA address for i64 results; also accepts X86WrapperRIP as a root.
	def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr",
	[add, sub, mul, X86mul_imm, shl, or, frameindex,
	X86WrapperRIP], []>;

	def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
	[tglobaltlsaddr], []>;

	def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
	[tglobaltlsaddr], []>;

	// Like addr, but selected by selectVectorAddr.
	def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;

	// A relocatable immediate is either an immediate operand or an operand that can
	// be relocated by the linker to an immediate, such as a regular symbol in
	// non-PIC code.
	def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [],
	0>;

	//===----------------------------------------------------------------------===//
	// X86 Instruction Predicate Definitions.
	//
	// Each Predicate wraps a C++ condition string. Has*/No* test one subtarget
	// feature; the Use<X> predicates additionally require the next encoding
	// level (AVX for the SSE ones, AVX512 for the AVX ones) to be absent.
	// Predicates that also derive from AssemblerPredicate name the feature and
	// give a description string.
	def TruePredicate : Predicate<"true">;

	def HasCMov : Predicate<"Subtarget->hasCMov()">;
	def NoCMov : Predicate<"!Subtarget->hasCMov()">;

	def HasMMX : Predicate<"Subtarget->hasMMX()">;
	def Has3DNow : Predicate<"Subtarget->has3DNow()">;
	def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
	def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
	def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
	def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
	def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
	def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
	def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
	def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
	def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
	def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
	def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">;
	def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
	def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
	def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
	def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
	def NoAVX : Predicate<"!Subtarget->hasAVX()">;
	def HasAVX : Predicate<"Subtarget->hasAVX()">;
	def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
	def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
	def HasAVX512 : Predicate<"Subtarget->hasAVX512()">,
	AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
	def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
	def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
	def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
	def HasCDI : Predicate<"Subtarget->hasCDI()">,
	AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
	def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">,
	AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">;
	def HasPFI : Predicate<"Subtarget->hasPFI()">,
	AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
	def HasERI : Predicate<"Subtarget->hasERI()">,
	AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">;
	def HasDQI : Predicate<"Subtarget->hasDQI()">,
	AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">;
	def NoDQI : Predicate<"!Subtarget->hasDQI()">;
	def HasBWI : Predicate<"Subtarget->hasBWI()">,
	AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">;
	def NoBWI : Predicate<"!Subtarget->hasBWI()">;
	def HasVLX : Predicate<"Subtarget->hasVLX()">,
	AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">;
	def NoVLX : Predicate<"!Subtarget->hasVLX()">;
	// Predicate strings are C++ expressions, so logical-or is written as a plain
	// "||"; a backslash before '|' is not a valid TableGen string escape.
	// NoVLX_Or_No<F> holds when VLX or feature <F> is missing.
	def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
	def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
	def PKU : Predicate<"Subtarget->hasPKU()">;
	def HasVNNI : Predicate<"Subtarget->hasVNNI()">,
	AssemblerPredicate<"FeatureVNNI", "AVX-512 VNNI ISA">;

	def HasBITALG : Predicate<"Subtarget->hasBITALG()">,
	AssemblerPredicate<"FeatureBITALG", "AVX-512 BITALG ISA">;
	def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
	def HasAES : Predicate<"Subtarget->hasAES()">;
	def HasVAES : Predicate<"Subtarget->hasVAES()">;
	def NoVLX_Or_NoVAES : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVAES()">;
	// State save/restore feature predicates.
	def HasFXSR : Predicate<"Subtarget->hasFXSR()">;
	def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">;
	def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">;
	def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
	def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
	def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
	def NoVLX_Or_NoVPCLMULQDQ :
	Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">;
	// Further single-feature predicates, followed by the mode and target-ABI
	// predicates. The *BitMode predicates also derive from AssemblerPredicate,
	// keyed on Mode16Bit / Mode32Bit / Mode64Bit.
	def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">;
	def HasGFNI : Predicate<"Subtarget->hasGFNI()">;
	def HasFMA : Predicate<"Subtarget->hasFMA()">;
	def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
	def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">;
	def HasXOP : Predicate<"Subtarget->hasXOP()">;
	def HasTBM : Predicate<"Subtarget->hasTBM()">;
	def NoTBM : Predicate<"!Subtarget->hasTBM()">;
	def HasLWP : Predicate<"Subtarget->hasLWP()">;
	def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
	def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
	def HasF16C : Predicate<"Subtarget->hasF16C()">;
	def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
	def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
	def HasBMI : Predicate<"Subtarget->hasBMI()">;
	def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
	def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">;
	def HasVBMI : Predicate<"Subtarget->hasVBMI()">,
	AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">;
	def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">,
	AssemblerPredicate<"FeatureVBMI2", "AVX-512 VBMI2 ISA">;
	def HasIFMA : Predicate<"Subtarget->hasIFMA()">,
	AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
	def HasRTM : Predicate<"Subtarget->hasRTM()">;
	def HasADX : Predicate<"Subtarget->hasADX()">;
	def HasSHA : Predicate<"Subtarget->hasSHA()">;
	def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
	def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
	def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
	def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
	// Same condition string as HasPRFCHW above.
	def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
	def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
	def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
	def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
	def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
	// FPStackf32 / FPStackf64 hold when SSE1 / SSE2 is not available.
	def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
	def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
	def HasMPX : Predicate<"Subtarget->hasMPX()">;
	def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">;
	def HasIBT : Predicate<"Subtarget->hasIBT()">;
	def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
	def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
	def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
	def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
	AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
	def In64BitMode : Predicate<"Subtarget->is64Bit()">,
	AssemblerPredicate<"Mode64Bit", "64-bit mode">;
	def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">;
	def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
	def In16BitMode : Predicate<"Subtarget->is16Bit()">,
	AssemblerPredicate<"Mode16Bit", "16-bit mode">;
	def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
	AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
	def In32BitMode : Predicate<"Subtarget->is32Bit()">,
	AssemblerPredicate<"Mode32Bit", "32-bit mode">;
	def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
	def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
	// Holds unless the target is Win64 and the current function has no frame
	// pointer. It reads *MF, so it is recomputed for every function. The
	// condition is split across two adjacent string literals; logical-or is a
	// plain "||" because a backslash before '|' is not a valid TableGen escape.
	def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
	"Subtarget->getFrameLowering()->hasFP(*MF)"> {
	let RecomputePerFunction = 1;
	}
	// Target OS / environment predicates.
	def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
	def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
	def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
	def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
	// Code-model predicates; NearData holds for either the Small or the Kernel
	// code model.
	def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
	def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
	def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
	"TM.getCodeModel() == CodeModel::Kernel">;
	def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;

	// We could compute these on a per-module basis but doing so requires accessing
	// the Function object through the <Target>Subtarget and objections were raised
	// to that (see post-commit review comments for r301750).
	// Every predicate in this group reads MF->getFunction(), so the whole group
	// is declared with RecomputePerFunction = 1.
	let RecomputePerFunction = 1 in {
	def OptForSize : Predicate<"MF->getFunction().optForSize()">;
	def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">;
	// OptForSpeed is the exact negation of OptForSize.
	def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">;
	// Holds unless the subtarget reports slow inc/dec; always holds when the
	// function is optimized for size.
	def UseIncDec : Predicate<"!Subtarget->slowIncDec() \|\| "
	"MF->getFunction().optForSize()">;
	}

	// Remaining subtarget predicates. Several are negations of a "slow"
	// subtarget query (slowTwoMemOps, isUnalignedMem32Slow).
	def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
	def FavorMemIndirectCall : Predicate<"!Subtarget->slowTwoMemOps()">;
	def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
	def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
	def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
	def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
	def HasMFence : Predicate<"Subtarget->hasMFence()">;
	// UseRetpoline / NotUseRetpoline are complementary tests of
	// Subtarget->useRetpoline(); both lines are additions in this diff.
	+def UseRetpoline : Predicate<"Subtarget->useRetpoline()">;
	+def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">;

	//===----------------------------------------------------------------------===//
	// X86 Instruction Format Definitions.
	//

	include "X86InstrFormats.td"

	//===----------------------------------------------------------------------===//
	// Pattern fragments.
	//

	// X86 specific condition code. These correspond to CondCode in
	// X86InstrInfo.h. They must be kept in sync.
	// Each leaf matches the i8 constant given in its pattern (0 through 15); the
	// trailing comment names the alternative mnemonic for the same code, if any.
	def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
	def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
	def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
	def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
	def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
	def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
	def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
	def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
	def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
	def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
	def X86_COND_NO : PatLeaf<(i8 10)>;
	def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
	def X86_COND_NS : PatLeaf<(i8 12)>;
	def X86_COND_O : PatLeaf<(i8 13)>;
	def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
	def X86_COND_S : PatLeaf<(i8 15)>;

	// Immediate leaves named iNNimmSExtM: an iNN immediate that passes
	// isInt<M>(Imm), i.e. fits in a signed M-bit field.
	def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
	def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
	def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
	def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;

	// FIXME: Ideally we would just replace the above iimmSExt matchers with
	// relocImm-based matchers, but then FastISel would be unable to use them.
	// relocImm counterparts of i64immSExt8 / i64immSExt32: the node is accepted
	// when isSExtRelocImm<8> / isSExtRelocImm<32> returns true for it.
	def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{
	return isSExtRelocImm<8>(N);
	}]>;
	def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
	return isSExtRelocImm<32>(N);
	}]>;

	// If we have multiple users of an immediate, it's much smaller to reuse
	// the register, rather than encode the immediate in every instruction.
	// This has the risk of increasing register pressure from stretched live
	// ranges, however, the immediates should be trivial to rematerialize by
	// the RA in the event of high register pressure.
	// TODO : This is currently enabled for stores and binary ops. There are more
	// cases for which this can be enabled, though this catches the bulk of the
	// issues.
	// TODO2 : This should really also be enabled under O2, but there's currently
	// an issue with RA where we don't pull the constants into their users
	// when we rematerialize them. I'll follow-up on enabling O2 after we fix that
	// issue.
	// TODO3 : This is currently limited to single basic blocks (DAG creation
	// pulls block immediates to the top and merges them if necessary).
	// Eventually, it would be nice to allow ConstantHoisting to merge constants
	// globally for potentially added savings.
	//
	// Every *_su leaf below wraps a base immediate matcher and additionally
	// requires shouldAvoidImmediateInstFormsForSize(N) to return false.
	//
	// Plain relocImm of each width.
	def imm8_su : PatLeaf<(i8 relocImm), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;
	def imm16_su : PatLeaf<(i16 relocImm), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;
	def imm32_su : PatLeaf<(i32 relocImm), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;
	def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;

	// Sign-extended 8-bit immediates of each width.
	def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;
	def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;
	def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;

	// relocImm-based 64-bit variants.
	def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;
	def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
	return !shouldAvoidImmediateInstFormsForSize(N);
	}]>;

	// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
	// unsigned field.
	def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;

	// i64immZExt32SExt8 - True if the 64-bit immediate fits in a 32-bit unsigned
	// field and, once cast to int32_t, also fits in a signed 8-bit field.
	def i64immZExt32SExt8 : ImmLeaf<i64, [{
	return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm));
	}]>;

	// Helper fragments for loads.
	// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
	// known to be 32-bit aligned or better. Ditto for i8 to i16.
	//
	// loadi16: unindexed load producing i16. Accepts any non-extending load, and
	// an any-extending load only when it is at least 2-byte aligned and not
	// volatile. Sign- and zero-extending loads are rejected.
	def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
	LoadSDNode *LD = cast<LoadSDNode>(N);
	ISD::LoadExtType ExtType = LD->getExtensionType();
	if (ExtType == ISD::NON_EXTLOAD)
	return true;
	if (ExtType == ISD::EXTLOAD)
	return LD->getAlignment() >= 2 && !LD->isVolatile();
	return false;
	}]>;

	// loadi16_anyext: unindexed load producing i32. Matches only any-extending
	// loads that are at least 2-byte aligned and not volatile; unlike loadi16,
	// a non-extending load is rejected.
	def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
	LoadSDNode *LD = cast<LoadSDNode>(N);
	ISD::LoadExtType ExtType = LD->getExtensionType();
	if (ExtType == ISD::EXTLOAD)
	return LD->getAlignment() >= 2 && !LD->isVolatile();
	return false;
	}]>;

	// loadi32: unindexed load producing i32. Accepts any non-extending load, and
	// an any-extending load only when it is at least 4-byte aligned and not
	// volatile. Sign- and zero-extending loads are rejected.
	def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
	LoadSDNode *LD = cast<LoadSDNode>(N);
	ISD::LoadExtType ExtType = LD->getExtensionType();
	if (ExtType == ISD::NON_EXTLOAD)
	return true;
	if (ExtType == ISD::EXTLOAD)
	return LD->getAlignment() >= 4 && !LD->isVolatile();
	return false;
	}]>;

	// Plain loads with the result type pinned; no extra C++ predicate.
	def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
	def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
	def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
	def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
	def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
	def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;

	// Sign-extending loads, named sextload<result type><memory type>.
	def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
	def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
	def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
	def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
	def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
	def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;

	// Zero-extending loads, named zextload<result type><memory type>.
	def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
	def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
	def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
	def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
	def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
	def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
	def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
	def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
	def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
	def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;

	// Any-extending loads, named extload<result type><memory type>.
	def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
	def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
	def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
	def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
	def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
	def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
	def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
	def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
	def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
	def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;


	// An 'and' node with a single use: matches (and lhs, rhs) only when
	// N->hasOneUse() is true.
	def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
	return N->hasOneUse();
	}]>;
	// An 'srl' node with a single use: matches (srl lhs, rhs) only when
	// N->hasOneUse() is true.
	def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
	return N->hasOneUse();
	}]>;
	// A 'trunc' node with a single use: matches (trunc src) only when
	// N->hasOneUse() is true.
	def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
	return N->hasOneUse();
	}]>;

	//===----------------------------------------------------------------------===//
	// Instruction list.
	//

	// Nop
	// NOOP is the operand-less opcode 0x90 form. The 0x1f forms take one operand
	// named $zero (memory or register) in 16-, 32- and 64-bit widths; the 64-bit
	// ones require In64BitMode. None of them has a selection pattern.
	let hasSideEffects = 0, SchedRW = [WriteZero] in {
	def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
	def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
	"nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
	def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
	"nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
	def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero),
	"nop{q}\t$zero", [], IIC_NOP>, TB,
	Requires<[In64BitMode]>;
	// Also allow register so we can assemble/disassemble
	def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero),
	"nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
	def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero),
	"nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
	def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero),
	"nop{q}\t$zero", [], IIC_NOP>, TB,
	Requires<[In64BitMode]>;
	}


	// Constructing a stack frame.
	// ENTER takes a 16-bit immediate $len and an 8-bit immediate $lvl. It has no
	// selection pattern and is scheduled as WriteMicrocoded.
	def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
	"enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;

	// leave. Both records use opcode 0xC9 and may load. LEAVE implicitly reads
	// and writes EBP/ESP and requires Not64BitMode; LEAVE64 does the same with
	// RBP/RSP and requires In64BitMode.
	let SchedRW = [WriteALU] in {
	let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in
	def LEAVE : I<0xC9, RawFrm,
	(outs), (ins), "leave", [], IIC_LEAVE>,
	Requires<[Not64BitMode]>;

	let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in
	def LEAVE64 : I<0xC9, RawFrm,
	(outs), (ins), "leave", [], IIC_LEAVE>,
	Requires<[In64BitMode]>;
	} // SchedRW

	//===----------------------------------------------------------------------===//
	// Miscellaneous Instructions.
	//

	// Pseudo instruction selected from the X86eh_sjlj_setup_dispatch node. It has
	// no operands, is marked as a barrier with side effects, and sets
	// usesCustomInserter.
	let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1,
	SchedRW = [WriteSystem] in
	def Int_eh_sjlj_setup_dispatch
	: PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>;

	// 16- and 32-bit push/pop. Every record in this group implicitly reads and
	// writes ESP. The 32-bit forms require Not64BitMode; the 16-bit forms carry
	// no mode requirement. None has a selection pattern.
	let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
	// Pop into a register: short form (0x58, AddRegFrm) and ModR/M form
	// (0x8F, MRM0r).
	let mayLoad = 1, SchedRW = [WriteLoad] in {
	def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
	IIC_POP_REG16>, OpSize16;
	def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
	IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
	def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
	IIC_POP_REG>, OpSize16;
	def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
	IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
	} // mayLoad, SchedRW
	// Pop into memory: both loads and stores, scheduled as WriteRMW.
	let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in {
	def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
	IIC_POP_MEM>, OpSize16;
	def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
	IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
	} // mayStore, mayLoad, WriteRMW

	// Push a register (0x50 short form, 0xFF MRM6r form) or an immediate
	// (0x6a with an 8-bit immediate, 0x68 with a full-width immediate).
	let mayStore = 1, SchedRW = [WriteStore] in {
	def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
	IIC_PUSH_REG>, OpSize16;
	def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
	IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
	def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
	IIC_PUSH_REG>, OpSize16;
	def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
	IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;

	def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
	"push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
	def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
	"push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;

	def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
	"push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
	Requires<[Not64BitMode]>;
	def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
	"push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
	Requires<[Not64BitMode]>;
	} // mayStore, SchedRW

	// Push from memory: both loads and stores, scheduled as WriteRMW.
	let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
	def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
	IIC_PUSH_MEM>, OpSize16;
	def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
	IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
	} // mayLoad, mayStore, SchedRW

	}

	// Pseudo instructions selected from the int_x86_flags_read_u32/u64
	// intrinsics. They set usesCustomInserter and are marked as loading and
	// storing. Both list ESP in Defs (from the outer let); the 32-bit form uses
	// ESP and the 64-bit form uses RSP.
	let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
	SchedRW = [WriteRMW], Defs = [ESP] in {
	let Uses = [ESP] in
	def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins),
	[(set GR32:$dst, (int_x86_flags_read_u32))]>,
	Requires<[Not64BitMode]>;

	let Uses = [RSP] in
	def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins),
	[(set GR64:$dst, (int_x86_flags_read_u64))]>,
	Requires<[In64BitMode]>;
	}

	// Pseudo instructions selected from the int_x86_flags_write_u32/u64
	// intrinsics. They set usesCustomInserter, are marked as loading and
	// storing, and define EFLAGS together with the stack pointer (ESP for the
	// 32-bit form, RSP for the 64-bit form).
	let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
	SchedRW = [WriteRMW] in {
	let Defs = [ESP, EFLAGS], Uses = [ESP] in
	def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src),
	[(int_x86_flags_write_u32 GR32:$src)]>,
	Requires<[Not64BitMode]>;

	let Defs = [RSP, EFLAGS], Uses = [RSP] in
	def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src),
	[(int_x86_flags_write_u64 GR64:$src)]>,
	Requires<[In64BitMode]>;
	}

	// popf, 16- and 32-bit: may load, reads ESP, and writes both ESP and EFLAGS.
	// The 32-bit form requires Not64BitMode.
	let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
	SchedRW = [WriteLoad] in {
	def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
	OpSize16;
	def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l\|d}", [], IIC_POP_FD>,
	OpSize32, Requires<[Not64BitMode]>;
	}

	// pushf, 16- and 32-bit: may store, reads ESP and EFLAGS, and writes ESP.
	// The 32-bit form requires Not64BitMode.
	let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0,
	SchedRW = [WriteStore] in {
	def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
	OpSize16;
	def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l\|d}", [], IIC_PUSH_F>,
	OpSize32, Requires<[Not64BitMode]>;
	}

	// 64-bit push/pop. Every record implicitly reads and writes RSP, requires
	// In64BitMode, and is declared with OpSize32. The opcodes and forms mirror
	// the 16/32-bit records (0x58/0x8F for pop, 0x50/0xFF for push).
	let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in {
	// Pop into a register.
	let mayLoad = 1, SchedRW = [WriteLoad] in {
	def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
	IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
	def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
	IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
	} // mayLoad, SchedRW
	// Pop into memory.
	let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in
	def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
	IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
	// Push a register.
	let mayStore = 1, SchedRW = [WriteStore] in {
	def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
	IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
	def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
	IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
	} // mayStore, SchedRW
	// Push from memory.
	let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
	def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
	IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
	} // mayLoad, mayStore, SchedRW
	}

	// 64-bit push of an immediate: an 8-bit immediate operand (i64i8imm) for
	// opcode 0x6a and a 32-bit one (i64i32imm) for opcode 0x68. Both may store,
	// read and write RSP, and require In64BitMode.
	let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
	SchedRW = [WriteStore] in {
	def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
	"push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
	Requires<[In64BitMode]>;
	def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
	"push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
	Requires<[In64BitMode]>;
	}

	// popfq: 64-bit-mode counterpart of POPF16/POPF32 (same opcode 0x9D). May
	// load, reads RSP, and writes RSP and EFLAGS.
	let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
	def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
	OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
	// pushfq: 64-bit-mode counterpart of PUSHF16/PUSHF32 (same opcode 0x9C). May
	// store, reads RSP and EFLAGS, and writes RSP.
	let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in
	def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
	OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;

	// popa: may load, reads ESP, and lists all eight 32-bit general registers
	// (including ESP) as implicit defs. Both widths require Not64BitMode.
	let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
	mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
	def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>,
	OpSize32, Requires<[Not64BitMode]>;
	def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>,
	OpSize16, Requires<[Not64BitMode]>;
	}
	// pusha: may store, lists all eight 32-bit general registers as implicit
	// uses, and writes ESP. Both widths require Not64BitMode.
	let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
	mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
	def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>,
	OpSize32, Requires<[Not64BitMode]>;
	def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>,
	OpSize16, Requires<[Not64BitMode]>;
	}

	// bswap, 32- and 64-bit. The source is tied to the destination through the
	// "$src = $dst" constraint, so only $dst appears in the assembly string.
	// Both records are selected from the generic bswap node.
	let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
	// GR32 = bswap GR32
	def BSWAP32r : I<0xC8, AddRegFrm,
	(outs GR32:$dst), (ins GR32:$src),
	"bswap{l}\t$dst",
	[(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB;

	// GR64 = bswap GR64
	def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
	"bswap{q}\t$dst",
	[(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
	} // Constraints = "$src = $dst", SchedRW

	// Bit scan instructions.
	// BSF (0xBC) is selected from X86bsf and BSR (0xBD) from X86bsr. Each pattern
	// sets both $dst and EFLAGS, and EFLAGS is listed as an implicit def. The rr
	// forms read a register and are scheduled as WriteShift; the rm forms fold a
	// load (loadi16 / loadi32 / loadi64) and are scheduled as WriteShiftLd.
	let Defs = [EFLAGS] in {
	def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
	"bsf{w}\t{$src, $dst\|$dst, $src}",
	[(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
	IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
	def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
	"bsf{w}\t{$src, $dst\|$dst, $src}",
	[(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
	IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
	def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
	"bsf{l}\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
	IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
	def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
	"bsf{l}\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
	IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
	def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
	"bsf{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
	IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
	def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
	"bsf{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
	IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;

	def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
	"bsr{w}\t{$src, $dst\|$dst, $src}",
	[(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
	IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
	def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
	"bsr{w}\t{$src, $dst\|$dst, $src}",
	[(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
	IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
	def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
	"bsr{l}\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
	IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
	def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
	"bsr{l}\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
	IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
	def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
	"bsr{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))],
	IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
	def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
	"bsr{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
	IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
	} // Defs = [EFLAGS]

	// String instructions (movs, stos, scas, cmps) in byte, word, dword and
	// qword widths. All are scheduled as WriteMicrocoded and none has a selection
	// pattern.
	let SchedRW = [WriteMicrocoded] in {
	// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
	// NOTE(review): MOVSQ inherits the 32-bit EDI/ESI lists, whereas STOSQ below
	// lists RDI - confirm whether that is intended.
	let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
	def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
	"movsb\t{$src, $dst\|$dst, $src}", [], IIC_MOVS>;
	def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
	"movsw\t{$src, $dst\|$dst, $src}", [], IIC_MOVS>, OpSize16;
	def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
	"movs{l\|d}\t{$src, $dst\|$dst, $src}", [], IIC_MOVS>, OpSize32;
	def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
	"movsq\t{$src, $dst\|$dst, $src}", [], IIC_MOVS>;
	}

	// These use the DF flag in the EFLAGS register to inc or dec EDI (RDI for
	// STOSQ). The value stored comes from AL / AX / EAX / RAX.
	let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
	def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst),
	"stosb\t{%al, $dst\|$dst, al}", [], IIC_STOS>;
	let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
	def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst),
	"stosw\t{%ax, $dst\|$dst, ax}", [], IIC_STOS>, OpSize16;
	let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
	def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst),
	"stos{l\|d}\t{%eax, $dst\|$dst, eax}", [], IIC_STOS>, OpSize32;
	let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in
	def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst),
	"stosq\t{%rax, $dst\|$dst, rax}", [], IIC_STOS>;

	// These use the DF flag in the EFLAGS register to inc or dec EDI. They
	// also write EFLAGS, and read AL / AX / EAX / RAX.
	// NOTE(review): SCASQ lists EDI rather than RDI, unlike STOSQ above -
	// confirm whether that is intended.
	let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in
	def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
	"scasb\t{$dst, %al\|al, $dst}", [], IIC_SCAS>;
	let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in
	def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
	"scasw\t{$dst, %ax\|ax, $dst}", [], IIC_SCAS>, OpSize16;
	let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in
	def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
	"scas{l\|d}\t{$dst, %eax\|eax, $dst}", [], IIC_SCAS>, OpSize32;
	let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in
	def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
	"scasq\t{$dst, %rax\|rax, $dst}", [], IIC_SCAS>;

	// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI.
	// They also write EFLAGS.
	// NOTE(review): CMPSQ inherits the 32-bit EDI/ESI lists - confirm whether
	// that is intended.
	let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in {
	def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
	"cmpsb\t{$dst, $src\|$src, $dst}", [], IIC_CMPS>;
	def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
	"cmpsw\t{$dst, $src\|$src, $dst}", [], IIC_CMPS>, OpSize16;
	def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
	"cmps{l\|d}\t{$dst, $src\|$src, $dst}", [], IIC_CMPS>, OpSize32;
	def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
	"cmpsq\t{$dst, $src\|$src, $dst}", [], IIC_CMPS>;
	}
	} // SchedRW

	//===----------------------------------------------------------------------===//
	// Move Instructions.
	//
	// Register-to-register and immediate-to-register moves, all scheduled as
	// WriteMove.
	let SchedRW = [WriteMove] in {
	// Register-to-register moves (MRMDestReg encodings); no selection pattern.
	let hasSideEffects = 0 in {
	def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
	"mov{b}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>;
	def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
	"mov{w}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>, OpSize16;
	def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
	"mov{l}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>, OpSize32;
	def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
	"mov{q}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>;
	}

	// Immediate-to-register moves. The 8- and 16-bit forms match imm, MOV32ri
	// matches relocImm, and MOV64ri32 matches a 64-bit immediate accepted by
	// i64immSExt32. All four are rematerializable and as cheap as a move.
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
	def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
	"mov{b}\t{$src, $dst\|$dst, $src}",
	[(set GR8:$dst, imm:$src)], IIC_MOV>;
	def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
	"mov{w}\t{$src, $dst\|$dst, $src}",
	[(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
	def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
	"mov{l}\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, relocImm:$src)], IIC_MOV>, OpSize32;
	def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
	"mov{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
	}
	// Full 64-bit immediate move (movabs). Rematerializable, but not declared
	// isAsCheapAsAMove like the forms above.
	let isReMaterializable = 1 in {
	def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
	"movabs{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, relocImm:$src)], IIC_MOV>;
	}

	// Longer forms that use a ModR/M byte. Needed for disassembler
	// Each is isCodeGenOnly with ForceDisassemble set and names its primary
	// record through FoldGenData.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
	def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
	"mov{b}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>,
	FoldGenData<"MOV8ri">;
	def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
	"mov{w}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>, OpSize16,
	FoldGenData<"MOV16ri">;
	def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
	"mov{l}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>, OpSize32,
	FoldGenData<"MOV32ri">;
	}
	} // SchedRW

	// Store-immediate moves. Each pattern matches a store of an immediate that
	// passes the corresponding *_su leaf (imm8_su, imm16_su, imm32_su,
	// i64immSExt32_su) defined earlier in this file. MOV64mi32 requires
	// In64BitMode.
	let SchedRW = [WriteStore] in {
	def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
	"mov{b}\t{$src, $dst\|$dst, $src}",
	[(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
	def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
	"mov{w}\t{$src, $dst\|$dst, $src}",
	[(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
	def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
	"mov{l}\t{$src, $dst\|$dst, $src}",
	[(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
	def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
	"mov{q}\t{$src, $dst\|$dst, $src}",
	[(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>,
	Requires<[In64BitMode]>;
	} // SchedRW

	// Moves between the accumulator and a memory offset; none has a selection
	// pattern. Naming: MOV<width>ao<N> loads from an N-bit offset into
	// AL/AX/EAX/RAX (listed in Defs), and MOV<width>o<N>a stores AL/AX/EAX/RAX
	// (listed in Uses) to an N-bit offset. N matches the AdSize16 / AdSize32 /
	// AdSize64 modifier on the record.
	let hasSideEffects = 0 in {

	/// Memory offset versions of moves. The immediate is an address mode sized
	/// offset from the segment base.
	let SchedRW = [WriteALU] in {
	// Loads with 32-bit, then 16-bit, offsets.
	let mayLoad = 1 in {
	let Defs = [AL] in
	def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
	"mov{b}\t{$src, %al\|al, $src}", [], IIC_MOV_MEM>,
	AdSize32;
	let Defs = [AX] in
	def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
	"mov{w}\t{$src, %ax\|ax, $src}", [], IIC_MOV_MEM>,
	OpSize16, AdSize32;
	let Defs = [EAX] in
	def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
	"mov{l}\t{$src, %eax\|eax, $src}", [], IIC_MOV_MEM>,
	OpSize32, AdSize32;
	let Defs = [RAX] in
	def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src),
	"mov{q}\t{$src, %rax\|rax, $src}", [], IIC_MOV_MEM>,
	AdSize32;

	let Defs = [AL] in
	def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src),
	"mov{b}\t{$src, %al\|al, $src}", [], IIC_MOV_MEM>, AdSize16;
	let Defs = [AX] in
	def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
	"mov{w}\t{$src, %ax\|ax, $src}", [], IIC_MOV_MEM>,
	OpSize16, AdSize16;
	let Defs = [EAX] in
	def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
	"mov{l}\t{$src, %eax\|eax, $src}", [], IIC_MOV_MEM>,
	AdSize16, OpSize32;
	}
	// Stores with 32-bit, then 16-bit, offsets.
	let mayStore = 1 in {
	let Uses = [AL] in
	def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst),
	"mov{b}\t{%al, $dst\|$dst, al}", [], IIC_MOV_MEM>, AdSize32;
	let Uses = [AX] in
	def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst),
	"mov{w}\t{%ax, $dst\|$dst, ax}", [], IIC_MOV_MEM>,
	OpSize16, AdSize32;
	let Uses = [EAX] in
	def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst),
	"mov{l}\t{%eax, $dst\|$dst, eax}", [], IIC_MOV_MEM>,
	OpSize32, AdSize32;
	let Uses = [RAX] in
	def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst),
	"mov{q}\t{%rax, $dst\|$dst, rax}", [], IIC_MOV_MEM>,
	AdSize32;

	let Uses = [AL] in
	def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst),
	"mov{b}\t{%al, $dst\|$dst, al}", [], IIC_MOV_MEM>, AdSize16;
	let Uses = [AX] in
	def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst),
	"mov{w}\t{%ax, $dst\|$dst, ax}", [], IIC_MOV_MEM>,
	OpSize16, AdSize16;
	let Uses = [EAX] in
	def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
	"mov{l}\t{%eax, $dst\|$dst, eax}", [], IIC_MOV_MEM>,
	OpSize32, AdSize16;
	}
	}

	// These forms all have full 64-bit absolute addresses in their instructions
	// and use the movabs mnemonic to indicate this specific form.
	// Unlike the forms above, they sit outside the SchedRW = [WriteALU] scope.
	let mayLoad = 1 in {
	let Defs = [AL] in
	def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
	"movabs{b}\t{$src, %al\|al, $src}", [], IIC_MOV_MEM>,
	AdSize64;
	let Defs = [AX] in
	def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
	"movabs{w}\t{$src, %ax\|ax, $src}", [], IIC_MOV_MEM>,
	OpSize16, AdSize64;
	let Defs = [EAX] in
	def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
	"movabs{l}\t{$src, %eax\|eax, $src}", [], IIC_MOV_MEM>,
	OpSize32, AdSize64;
	let Defs = [RAX] in
	def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
	"movabs{q}\t{$src, %rax\|rax, $src}", [], IIC_MOV_MEM>,
	AdSize64;
	}

	// Store counterparts of the 64-bit absolute-address loads above.
	let mayStore = 1 in {
	let Uses = [AL] in
	def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
	"movabs{b}\t{%al, $dst\|$dst, al}", [], IIC_MOV_MEM>,
	AdSize64;
	let Uses = [AX] in
	def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
	"movabs{w}\t{%ax, $dst\|$dst, ax}", [], IIC_MOV_MEM>,
	OpSize16, AdSize64;
	let Uses = [EAX] in
	def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
	"movabs{l}\t{%eax, $dst\|$dst, eax}", [], IIC_MOV_MEM>,
	OpSize32, AdSize64;
	let Uses = [RAX] in
	def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
	"movabs{q}\t{%rax, $dst\|$dst, rax}", [], IIC_MOV_MEM>,
	AdSize64;
	}
	} // hasSideEffects = 0

	// Alternate register-to-register moves using the MRMSrcReg encodings
	// (0x8A / 0x8B) instead of the MRMDestReg ones used by MOV*rr. They are
	// isCodeGenOnly with ForceDisassemble set, carry no selection pattern, and
	// name their primary record through FoldGenData.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
	SchedRW = [WriteMove] in {
	def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
	"mov{b}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>,
	FoldGenData<"MOV8rr">;
	def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
	"mov{w}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>, OpSize16,
	FoldGenData<"MOV16rr">;
	def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
	"mov{l}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>, OpSize32,
	FoldGenData<"MOV32rr">;
	def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
	"mov{q}\t{$src, $dst\|$dst, $src}", [], IIC_MOV>,
	FoldGenData<"MOV64rr">;
	}

	// Memory-to-register moves. Marked canFoldAsLoad and rematerializable. The
	// 8/16/32-bit forms select from loadi8 / loadi16 / loadi32, while MOV64rm
	// selects from the generic load node.
	let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
	def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
	"mov{b}\t{$src, $dst\|$dst, $src}",
	[(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
	def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
	"mov{w}\t{$src, $dst\|$dst, $src}",
	[(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16;
	def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
	"mov{l}\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32;
	def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
	"mov{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
	}

	// Stores of a general-purpose register to memory; each def selects a plain
	// store of the register operand to the addressed location.
	let SchedRW = [WriteStore] in {
	  def MOV8mr  : I<0x88, MRMDestMem,
	                  (outs), (ins i8mem:$dst, GR8:$src),
	                  "mov{b}\t{$src, $dst\|$dst, $src}",
	                  [(store GR8:$src, addr:$dst)],
	                  IIC_MOV_MEM>;
	  def MOV16mr : I<0x89, MRMDestMem,
	                  (outs), (ins i16mem:$dst, GR16:$src),
	                  "mov{w}\t{$src, $dst\|$dst, $src}",
	                  [(store GR16:$src, addr:$dst)],
	                  IIC_MOV_MEM>,
	                OpSize16;
	  def MOV32mr : I<0x89, MRMDestMem,
	                  (outs), (ins i32mem:$dst, GR32:$src),
	                  "mov{l}\t{$src, $dst\|$dst, $src}",
	                  [(store GR32:$src, addr:$dst)],
	                  IIC_MOV_MEM>,
	                OpSize32;
	  def MOV64mr : RI<0x89, MRMDestMem,
	                   (outs), (ins i64mem:$dst, GR64:$src),
	                   "mov{q}\t{$src, $dst\|$dst, $src}",
	                   [(store GR64:$src, addr:$dst)],
	                   IIC_MOV_MEM>;
	} // SchedRW = [WriteStore]

	// NOREX variants of MOV8rr, MOV8mr and MOV8rm. They are restricted to
	// GR8_NOREX / i8mem_NOREX operands so they can copy and store the h
	// registers, which cannot be encoded once a REX prefix is present.
	// All three are pattern-less and free of unmodelled side effects.
	let isCodeGenOnly = 1, hasSideEffects = 0 in {
	  def MOV8rr_NOREX : I<0x88, MRMDestReg,
	                       (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
	                       "mov{b}\t{$src, $dst\|$dst, $src} # NOREX", [],
	                       IIC_MOV>,
	                     Sched<[WriteMove]>;

	  let mayStore = 1 in {
	    def MOV8mr_NOREX : I<0x88, MRMDestMem,
	                         (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
	                         "mov{b}\t{$src, $dst\|$dst, $src} # NOREX", [],
	                         IIC_MOV_MEM>,
	                       Sched<[WriteStore]>;
	  }

	  let mayLoad = 1, isReMaterializable = 1, canFoldAsLoad = 1 in {
	    def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
	                         (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
	                         "mov{b}\t{$src, $dst\|$dst, $src} # NOREX", [],
	                         IIC_MOV_MEM>,
	                       Sched<[WriteLoad]>;
	  }
	} // isCodeGenOnly, hasSideEffects


	// Condition code ops: transfers between AH and EFLAGS. Both require the
	// HasLAHFSAHF predicate.
	let SchedRW = [WriteALU] in {
	  // SAHF: reads AH, defines EFLAGS (selected from X86sahf).
	  let Uses = [AH], Defs = [EFLAGS] in {
	    def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
	                 [(set EFLAGS, (X86sahf AH))], IIC_AHF>,
	               Requires<[HasLAHFSAHF]>;
	  }

	  // LAHF: reads EFLAGS, defines AH (AH = flags). No selection pattern.
	  let Uses = [EFLAGS], Defs = [AH], hasSideEffects = 0 in {
	    def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], IIC_AHF>,
	               Requires<[HasLAHFSAHF]>;
	  }
	} // SchedRW = [WriteALU]

	//===----------------------------------------------------------------------===//
	// Bit test instructions: BT, BTS, BTR, BTC.

	let Defs = [EFLAGS] in {
	// BT with both the value and the bit index in registers; selected from
	// X86bt and producing EFLAGS.
	let SchedRW = [WriteALU] in {
	  def BT16rr : I<0xA3, MRMDestReg,
	                 (outs), (ins GR16:$src1, GR16:$src2),
	                 "bt{w}\t{$src2, $src1\|$src1, $src2}",
	                 [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))],
	                 IIC_BT_RR>,
	               OpSize16, TB, NotMemoryFoldable;
	  def BT32rr : I<0xA3, MRMDestReg,
	                 (outs), (ins GR32:$src1, GR32:$src2),
	                 "bt{l}\t{$src2, $src1\|$src1, $src2}",
	                 [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))],
	                 IIC_BT_RR>,
	               OpSize32, TB, NotMemoryFoldable;
	  def BT64rr : RI<0xA3, MRMDestReg,
	                  (outs), (ins GR64:$src1, GR64:$src2),
	                  "bt{q}\t{$src2, $src1\|$src1, $src2}",
	                  [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))],
	                  IIC_BT_RR>,
	               TB, NotMemoryFoldable;
	} // SchedRW = [WriteALU]

	// The memory+register form of bt, unlike the register+register form, does
	// not ignore the high bits of the index, which is bizarre from ISel's point
	// of view. These defs are therefore disassembly only for now (no patterns).
	// They are also slow on modern CPUs, which is one more reason not to
	// generate them.
	let SchedRW = [WriteALULd], mayLoad = 1, hasSideEffects = 0 in {
	  def BT16mr : I<0xA3, MRMDestMem,
	                 (outs), (ins i16mem:$src1, GR16:$src2),
	                 "bt{w}\t{$src2, $src1\|$src1, $src2}",
	                 [], IIC_BT_MR>,
	               OpSize16, TB, NotMemoryFoldable;
	  def BT32mr : I<0xA3, MRMDestMem,
	                 (outs), (ins i32mem:$src1, GR32:$src2),
	                 "bt{l}\t{$src2, $src1\|$src1, $src2}",
	                 [], IIC_BT_MR>,
	               OpSize32, TB, NotMemoryFoldable;
	  def BT64mr : RI<0xA3, MRMDestMem,
	                  (outs), (ins i64mem:$src1, GR64:$src2),
	                  "bt{q}\t{$src2, $src1\|$src1, $src2}",
	                  [], IIC_BT_MR>,
	               TB, NotMemoryFoldable;
	} // SchedRW, mayLoad, hasSideEffects

	// BT on a register with an 8-bit immediate bit index; selected from X86bt
	// with a sign-extended 8-bit immediate operand.
	let SchedRW = [WriteALU] in {
	  def BT16ri8 : Ii8<0xBA, MRM4r,
	                    (outs), (ins GR16:$src1, i16i8imm:$src2),
	                    "bt{w}\t{$src2, $src1\|$src1, $src2}",
	                    [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
	                    IIC_BT_RI>,
	                OpSize16, TB;
	  def BT32ri8 : Ii8<0xBA, MRM4r,
	                    (outs), (ins GR32:$src1, i32i8imm:$src2),
	                    "bt{l}\t{$src2, $src1\|$src1, $src2}",
	                    [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
	                    IIC_BT_RI>,
	                OpSize32, TB;
	  def BT64ri8 : RIi8<0xBA, MRM4r,
	                     (outs), (ins GR64:$src1, i64i8imm:$src2),
	                     "bt{q}\t{$src2, $src1\|$src1, $src2}",
	                     [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
	                     IIC_BT_RI>,
	                TB;
	} // SchedRW = [WriteALU]

	// Note that these instructions aren't slow because that only applies when the
	// other operand is in a register. When it's an immediate, bt is still fast.
	//
	// Every pattern below folds a load of $src1 into the bit test, so the group
	// is scheduled with the load-folding ALU class (WriteALULd), like the other
	// memory-operand bit-test groups in this section, instead of the
	// register-only WriteALU class.
	let SchedRW = [WriteALULd] in {
	def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
	"bt{w}\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
	], IIC_BT_MI>, OpSize16, TB;
	def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
	"bt{l}\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
	], IIC_BT_MI>, OpSize32, TB;
	// The 64-bit form is additionally restricted to 64-bit mode.
	def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
	"bt{q}\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS, (X86bt (loadi64 addr:$src1),
	i64immSExt8:$src2))], IIC_BT_MI>, TB,
	Requires<[In64BitMode]>;
	} // SchedRW

	let hasSideEffects = 0 in {
	// BTC with a register bit index. $src1 is tied to $dst; no selection
	// patterns are attached.
	let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
	  def BTC16rr : I<0xBB, MRMDestReg,
	                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
	                  "btc{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>,
	                OpSize16, TB, NotMemoryFoldable;
	  def BTC32rr : I<0xBB, MRMDestReg,
	                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
	                  "btc{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>,
	                OpSize32, TB, NotMemoryFoldable;
	  def BTC64rr : RI<0xBB, MRMDestReg,
	                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
	                   "btc{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>,
	                TB, NotMemoryFoldable;
	} // Constraints, SchedRW

	// BTC on a memory operand with a register bit index; marked as both loading
	// and storing. No selection patterns are attached.
	let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1 in {
	  def BTC16mr : I<0xBB, MRMDestMem,
	                  (outs), (ins i16mem:$src1, GR16:$src2),
	                  "btc{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                OpSize16, TB, NotMemoryFoldable;
	  def BTC32mr : I<0xBB, MRMDestMem,
	                  (outs), (ins i32mem:$src1, GR32:$src2),
	                  "btc{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                OpSize32, TB, NotMemoryFoldable;
	  def BTC64mr : RI<0xBB, MRMDestMem,
	                   (outs), (ins i64mem:$src1, GR64:$src2),
	                   "btc{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                TB, NotMemoryFoldable;
	} // SchedRW, mayLoad, mayStore

	// BTC on a register with an 8-bit immediate bit index. $src1 is tied to
	// $dst; no selection patterns are attached.
	let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
	  def BTC16ri8 : Ii8<0xBA, MRM7r,
	                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
	                     "btc{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 OpSize16, TB;
	  def BTC32ri8 : Ii8<0xBA, MRM7r,
	                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
	                     "btc{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 OpSize32, TB;
	  def BTC64ri8 : RIi8<0xBA, MRM7r,
	                      (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
	                      "btc{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 TB;
	} // Constraints, SchedRW

	// BTC on a memory operand with an 8-bit immediate bit index; marked as both
	// loading and storing. The 64-bit form is restricted to 64-bit mode.
	let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1 in {
	  def BTC16mi8 : Ii8<0xBA, MRM7m,
	                     (outs), (ins i16mem:$src1, i16i8imm:$src2),
	                     "btc{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 OpSize16, TB;
	  def BTC32mi8 : Ii8<0xBA, MRM7m,
	                     (outs), (ins i32mem:$src1, i32i8imm:$src2),
	                     "btc{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 OpSize32, TB;
	  def BTC64mi8 : RIi8<0xBA, MRM7m,
	                      (outs), (ins i64mem:$src1, i64i8imm:$src2),
	                      "btc{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 TB, Requires<[In64BitMode]>;
	} // SchedRW, mayLoad, mayStore

	// BTR with a register bit index. $src1 is tied to $dst; no selection
	// patterns are attached. All three widths use the IIC_BTX_RR itinerary,
	// matching the BTC and BTS register-register forms (BTR64rr previously
	// omitted it and fell back to the class default).
	let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
	def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
	"btr{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>,
	OpSize16, TB, NotMemoryFoldable;
	def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
	"btr{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>,
	OpSize32, TB, NotMemoryFoldable;
	def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
	"btr{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>, TB,
	NotMemoryFoldable;
	} // SchedRW

	// BTR on a memory operand with a register bit index; marked as both loading
	// and storing. No selection patterns are attached.
	let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1 in {
	  def BTR16mr : I<0xB3, MRMDestMem,
	                  (outs), (ins i16mem:$src1, GR16:$src2),
	                  "btr{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                OpSize16, TB, NotMemoryFoldable;
	  def BTR32mr : I<0xB3, MRMDestMem,
	                  (outs), (ins i32mem:$src1, GR32:$src2),
	                  "btr{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                OpSize32, TB, NotMemoryFoldable;
	  def BTR64mr : RI<0xB3, MRMDestMem,
	                   (outs), (ins i64mem:$src1, GR64:$src2),
	                   "btr{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                TB, NotMemoryFoldable;
	} // SchedRW, mayLoad, mayStore

	// BTR on a register with an 8-bit immediate bit index. $src1 is tied to
	// $dst; no selection patterns are attached.
	let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
	  def BTR16ri8 : Ii8<0xBA, MRM6r,
	                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
	                     "btr{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 OpSize16, TB;
	  def BTR32ri8 : Ii8<0xBA, MRM6r,
	                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
	                     "btr{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 OpSize32, TB;
	  def BTR64ri8 : RIi8<0xBA, MRM6r,
	                      (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
	                      "btr{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 TB;
	} // Constraints, SchedRW

	// BTR on a memory operand with an 8-bit immediate bit index; marked as both
	// loading and storing. The 64-bit form is restricted to 64-bit mode.
	let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1 in {
	  def BTR16mi8 : Ii8<0xBA, MRM6m,
	                     (outs), (ins i16mem:$src1, i16i8imm:$src2),
	                     "btr{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 OpSize16, TB;
	  def BTR32mi8 : Ii8<0xBA, MRM6m,
	                     (outs), (ins i32mem:$src1, i32i8imm:$src2),
	                     "btr{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 OpSize32, TB;
	  def BTR64mi8 : RIi8<0xBA, MRM6m,
	                      (outs), (ins i64mem:$src1, i64i8imm:$src2),
	                      "btr{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 TB, Requires<[In64BitMode]>;
	} // SchedRW, mayLoad, mayStore

	// BTS with a register bit index. $src1 is tied to $dst; no selection
	// patterns are attached.
	let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
	  def BTS16rr : I<0xAB, MRMDestReg,
	                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
	                  "bts{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>,
	                OpSize16, TB, NotMemoryFoldable;
	  def BTS32rr : I<0xAB, MRMDestReg,
	                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
	                  "bts{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>,
	                OpSize32, TB, NotMemoryFoldable;
	  def BTS64rr : RI<0xAB, MRMDestReg,
	                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
	                   "bts{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RR>,
	                TB, NotMemoryFoldable;
	} // Constraints, SchedRW

	// BTS on a memory operand with a register bit index; marked as both loading
	// and storing. No selection patterns are attached.
	let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1 in {
	  def BTS16mr : I<0xAB, MRMDestMem,
	                  (outs), (ins i16mem:$src1, GR16:$src2),
	                  "bts{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                OpSize16, TB, NotMemoryFoldable;
	  def BTS32mr : I<0xAB, MRMDestMem,
	                  (outs), (ins i32mem:$src1, GR32:$src2),
	                  "bts{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                OpSize32, TB, NotMemoryFoldable;
	  def BTS64mr : RI<0xAB, MRMDestMem,
	                   (outs), (ins i64mem:$src1, GR64:$src2),
	                   "bts{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MR>,
	                TB, NotMemoryFoldable;
	} // SchedRW, mayLoad, mayStore

	// BTS on a register with an 8-bit immediate bit index. $src1 is tied to
	// $dst; no selection patterns are attached.
	let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
	  def BTS16ri8 : Ii8<0xBA, MRM5r,
	                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
	                     "bts{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 OpSize16, TB;
	  def BTS32ri8 : Ii8<0xBA, MRM5r,
	                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
	                     "bts{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 OpSize32, TB;
	  def BTS64ri8 : RIi8<0xBA, MRM5r,
	                      (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
	                      "bts{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_RI>,
	                 TB;
	} // Constraints, SchedRW

	// BTS on a memory operand with an 8-bit immediate bit index; marked as both
	// loading and storing. The 64-bit form is restricted to 64-bit mode.
	let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1 in {
	  def BTS16mi8 : Ii8<0xBA, MRM5m,
	                     (outs), (ins i16mem:$src1, i16i8imm:$src2),
	                     "bts{w}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 OpSize16, TB;
	  def BTS32mi8 : Ii8<0xBA, MRM5m,
	                     (outs), (ins i32mem:$src1, i32i8imm:$src2),
	                     "bts{l}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 OpSize32, TB;
	  def BTS64mi8 : RIi8<0xBA, MRM5m,
	                      (outs), (ins i64mem:$src1, i64i8imm:$src2),
	                      "bts{q}\t{$src2, $src1\|$src1, $src2}", [], IIC_BTX_MI>,
	                 TB, Requires<[In64BitMode]>;
	} // SchedRW, mayLoad, mayStore
	} // hasSideEffects = 0
	} // Defs = [EFLAGS]


	//===----------------------------------------------------------------------===//
	// Atomic support
	//

	// Atomic swap. These are ordinary xchg instructions; because a memory
	// operand is referenced, atomicity is ensured.
	//
	// Template arguments:
	//   opc8     - opcode of the 8-bit form.
	//   opc      - opcode shared by the 16/32/64-bit forms.
	//   mnemonic - assembly mnemonic; the size suffix is appended per def.
	//   frag     - PatFrag name prefix; "_8"/"_16"/"_32"/"_64" is appended and
	//              the result is cast to a PatFrag for the selection pattern.
	//   itin     - itinerary class applied to every def.
	multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
	                       InstrItinClass itin> {
	  let SchedRW = [WriteALULd, WriteRMW], Constraints = "$val = $dst" in {
	    def NAME#8rm  : I<opc8, MRMSrcMem,
	                      (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
	                      mnemonic # "{b}\t{$val, $ptr\|$ptr, $val}",
	                      [(set GR8:$dst,
	                            (!cast<PatFrag>(!strconcat(frag, "_8"))
	                                addr:$ptr, GR8:$val))],
	                      itin>;
	    def NAME#16rm : I<opc, MRMSrcMem,
	                      (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr),
	                      mnemonic # "{w}\t{$val, $ptr\|$ptr, $val}",
	                      [(set GR16:$dst,
	                            (!cast<PatFrag>(!strconcat(frag, "_16"))
	                                addr:$ptr, GR16:$val))],
	                      itin>,
	                    OpSize16;
	    def NAME#32rm : I<opc, MRMSrcMem,
	                      (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr),
	                      mnemonic # "{l}\t{$val, $ptr\|$ptr, $val}",
	                      [(set GR32:$dst,
	                            (!cast<PatFrag>(!strconcat(frag, "_32"))
	                                addr:$ptr, GR32:$val))],
	                      itin>,
	                    OpSize32;
	    def NAME#64rm : RI<opc, MRMSrcMem,
	                       (outs GR64:$dst), (ins GR64:$val, i64mem:$ptr),
	                       mnemonic # "{q}\t{$val, $ptr\|$ptr, $val}",
	                       [(set GR64:$dst,
	                             (!cast<PatFrag>(!strconcat(frag, "_64"))
	                                 addr:$ptr, GR64:$val))],
	                       itin>;
	  } // SchedRW, Constraints
	}

	// Memory forms of xchg, instantiated from ATOMIC_SWAP (0x86 for the 8-bit
	// form, 0x87 otherwise) and selected from the atomic_swap_* fragments.
	defm XCHG : ATOMIC_SWAP<0x86, 0x87,
	                        "xchg", "atomic_swap",
	                        IIC_XCHG_MEM>;

	// Register forms of xchg. None of them carries a selection pattern.
	let SchedRW = [WriteALU] in {
	  // Swap between two registers; $val is tied to $dst.
	  let Constraints = "$val = $dst" in {
	    def XCHG8rr  : I<0x86, MRMSrcReg,
	                     (outs GR8:$dst), (ins GR8:$val, GR8:$src),
	                     "xchg{b}\t{$val, $src\|$src, $val}", [], IIC_XCHG_REG>;
	    def XCHG16rr : I<0x87, MRMSrcReg,
	                     (outs GR16:$dst), (ins GR16:$val, GR16:$src),
	                     "xchg{w}\t{$val, $src\|$src, $val}", [], IIC_XCHG_REG>,
	                   OpSize16;
	    def XCHG32rr : I<0x87, MRMSrcReg,
	                     (outs GR32:$dst), (ins GR32:$val, GR32:$src),
	                     "xchg{l}\t{$val, $src\|$src, $val}", [], IIC_XCHG_REG>,
	                   OpSize32;
	    def XCHG64rr : RI<0x87, MRMSrcReg,
	                      (outs GR64:$dst), (ins GR64:$val, GR64:$src),
	                      "xchg{q}\t{$val, $src\|$src, $val}", [], IIC_XCHG_REG>;
	  } // Constraints

	  // Swap between the accumulator (AX/EAX/RAX) and another register, using
	  // the 0x90 AddRegFrm encoding. The accumulator is an implicit use and def.
	  let Uses = [AX], Defs = [AX] in {
	    def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
	                     "xchg{w}\t{$src, %ax\|ax, $src}", [], IIC_XCHG_REG>,
	                   OpSize16;
	  }

	  let Uses = [EAX], Defs = [EAX] in {
	    def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
	                     "xchg{l}\t{$src, %eax\|eax, $src}", [], IIC_XCHG_REG>,
	                   OpSize32, Requires<[Not64BitMode]>;

	    // Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP
	    // encoding. xchg %eax, %eax needs to clear upper 32-bits of RAX so is
	    // not a NOP.
	    def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
	                       "xchg{l}\t{$src, %eax\|eax, $src}", [], IIC_XCHG_REG>,
	                     OpSize32, Requires<[In64BitMode]>;
	  }

	  let Uses = [RAX], Defs = [RAX] in {
	    def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
	                      "xchg{q}\t{$src, %rax\|rax, $src}", [], IIC_XCHG_REG>;
	  }
	} // SchedRW = [WriteALU]

	// Register-register xadd; assembler/disassembler definitions only (no
	// selection patterns).
	let SchedRW = [WriteALU] in {
	  def XADD8rr  : I<0xC0, MRMDestReg,
	                   (outs GR8:$dst), (ins GR8:$src),
	                   "xadd{b}\t{$src, $dst\|$dst, $src}", [], IIC_XADD_REG>,
	                 TB;
	  def XADD16rr : I<0xC1, MRMDestReg,
	                   (outs GR16:$dst), (ins GR16:$src),
	                   "xadd{w}\t{$src, $dst\|$dst, $src}", [], IIC_XADD_REG>,
	                 TB, OpSize16;
	  def XADD32rr : I<0xC1, MRMDestReg,
	                   (outs GR32:$dst), (ins GR32:$src),
	                   "xadd{l}\t{$src, $dst\|$dst, $src}", [], IIC_XADD_REG>,
	                 TB, OpSize32;
	  def XADD64rr : RI<0xC1, MRMDestReg,
	                    (outs GR64:$dst), (ins GR64:$src),
	                    "xadd{q}\t{$src, $dst\|$dst, $src}", [], IIC_XADD_REG>,
	                 TB;
	} // SchedRW = [WriteALU]

	// xadd with a memory destination; marked as both loading and storing.
	// No selection patterns are attached.
	let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1 in {
	  def XADD8rm  : I<0xC0, MRMDestMem,
	                   (outs), (ins i8mem:$dst, GR8:$src),
	                   "xadd{b}\t{$src, $dst\|$dst, $src}", [], IIC_XADD_MEM>,
	                 TB;
	  def XADD16rm : I<0xC1, MRMDestMem,
	                   (outs), (ins i16mem:$dst, GR16:$src),
	                   "xadd{w}\t{$src, $dst\|$dst, $src}", [], IIC_XADD_MEM>,
	                 TB, OpSize16;
	  def XADD32rm : I<0xC1, MRMDestMem,
	                   (outs), (ins i32mem:$dst, GR32:$src),
	                   "xadd{l}\t{$src, $dst\|$dst, $src}", [], IIC_XADD_MEM>,
	                 TB, OpSize32;
	  def XADD64rm : RI<0xC1, MRMDestMem,
	                    (outs), (ins i64mem:$dst, GR64:$src),
	                    "xadd{q}\t{$src, $dst\|$dst, $src}", [], IIC_XADD_MEM>,
	                 TB;
	} // SchedRW, mayLoad, mayStore

	// Register-register cmpxchg; no selection patterns. The 8-bit form uses its
	// own itinerary class (IIC_CMPXCHG_REG8).
	let SchedRW = [WriteALU] in {
	  def CMPXCHG8rr  : I<0xB0, MRMDestReg,
	                      (outs GR8:$dst), (ins GR8:$src),
	                      "cmpxchg{b}\t{$src, $dst\|$dst, $src}", [],
	                      IIC_CMPXCHG_REG8>,
	                    TB;
	  def CMPXCHG16rr : I<0xB1, MRMDestReg,
	                      (outs GR16:$dst), (ins GR16:$src),
	                      "cmpxchg{w}\t{$src, $dst\|$dst, $src}", [],
	                      IIC_CMPXCHG_REG>,
	                    TB, OpSize16;
	  def CMPXCHG32rr : I<0xB1, MRMDestReg,
	                      (outs GR32:$dst), (ins GR32:$src),
	                      "cmpxchg{l}\t{$src, $dst\|$dst, $src}", [],
	                      IIC_CMPXCHG_REG>,
	                    TB, OpSize32;
	  def CMPXCHG64rr : RI<0xB1, MRMDestReg,
	                       (outs GR64:$dst), (ins GR64:$src),
	                       "cmpxchg{q}\t{$src, $dst\|$dst, $src}", [],
	                       IIC_CMPXCHG_REG>,
	                    TB;
	} // SchedRW = [WriteALU]

	// Memory forms of cmpxchg plus cmpxchg8b/cmpxchg16b. No selection patterns
	// are attached to any of them.
	let SchedRW = [WriteALULd, WriteRMW] in {
	  // cmpxchg with a memory destination; marked as loading and storing.
	  let mayStore = 1, mayLoad = 1 in {
	    def CMPXCHG8rm  : I<0xB0, MRMDestMem,
	                        (outs), (ins i8mem:$dst, GR8:$src),
	                        "cmpxchg{b}\t{$src, $dst\|$dst, $src}", [],
	                        IIC_CMPXCHG_MEM8>,
	                      TB;
	    def CMPXCHG16rm : I<0xB1, MRMDestMem,
	                        (outs), (ins i16mem:$dst, GR16:$src),
	                        "cmpxchg{w}\t{$src, $dst\|$dst, $src}", [],
	                        IIC_CMPXCHG_MEM>,
	                      TB, OpSize16;
	    def CMPXCHG32rm : I<0xB1, MRMDestMem,
	                        (outs), (ins i32mem:$dst, GR32:$src),
	                        "cmpxchg{l}\t{$src, $dst\|$dst, $src}", [],
	                        IIC_CMPXCHG_MEM>,
	                      TB, OpSize32;
	    def CMPXCHG64rm : RI<0xB1, MRMDestMem,
	                         (outs), (ins i64mem:$dst, GR64:$src),
	                         "cmpxchg{q}\t{$src, $dst\|$dst, $src}", [],
	                         IIC_CMPXCHG_MEM>,
	                      TB;
	  } // mayStore, mayLoad

	  // cmpxchg8b: implicitly uses EAX/EBX/ECX/EDX and defines EAX/EDX/EFLAGS.
	  let Uses = [EAX, EBX, ECX, EDX], Defs = [EAX, EDX, EFLAGS] in {
	    def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
	                      "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>,
	                    TB;
	  }

	  // cmpxchg16b: implicitly uses RAX/RBX/RCX/RDX and defines RAX/RDX/EFLAGS.
	  // Requires HasCmpxchg16b and 64-bit mode.
	  let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RDX, EFLAGS] in {
	    def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
	                        "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
	                     TB, Requires<[HasCmpxchg16b, In64BitMode]>;
	  }
	} // SchedRW = [WriteALULd, WriteRMW]


	// The lock prefix (0xF0), modelled as a stand-alone operand-less instruction.
	let SchedRW = [WriteMicrocoded] in {
	  def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
	}

	// Stand-alone prefix "instructions", all scheduled as NOPs.
	let SchedRW = [WriteNop] in {
	  // rex64 prefix (0x48); 64-bit mode only.
	  def REX64_PREFIX  : I<0x48, RawFrm, (outs), (ins), "rex64", [], IIC_NOP>,
	                      Requires<[In64BitMode]>;

	  // data16 prefix (0x66); available outside 16-bit mode.
	  def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", [], IIC_NOP>,
	                      Requires<[Not16BitMode]>;

	  // data32 prefix: the same 0x66 byte, spelled "data32" in 16-bit mode.
	  def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", [], IIC_NOP>,
	                      Requires<[In16BitMode]>;
	} // SchedRW = [WriteNop]

	// Repeat prefixes for string operations. Both are modelled as reading ECX
	// and EFLAGS and as writing ECX.
	let SchedRW = [WriteMicrocoded], Uses = [ECX,EFLAGS], Defs = [ECX] in {
	  // Repeat (used with INS, OUTS, MOVS, LODS and STOS)
	  def REP_PREFIX   : I<0xF3, RawFrm, (outs), (ins), "rep", []>;

	  // Repeat while not equal (used with CMPS and SCAS)
	  def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
	} // SchedRW, Uses, Defs

	// String manipulation instructions: LODS. Every form reads ESI and EFLAGS
	// (the DF flag selects whether ESI is incremented or decremented) and
	// defines ESI together with the accumulator register of the matching width.
	let SchedRW = [WriteMicrocoded], Uses = [ESI,EFLAGS] in {
	  let Defs = [AL,ESI] in {
	    def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
	                  "lodsb\t{$src, %al\|al, $src}", [], IIC_LODS>;
	  }
	  let Defs = [AX,ESI] in {
	    def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
	                  "lodsw\t{$src, %ax\|ax, $src}", [], IIC_LODS>,
	                OpSize16;
	  }
	  let Defs = [EAX,ESI] in {
	    def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
	                  "lods{l\|d}\t{$src, %eax\|eax, $src}", [], IIC_LODS>,
	                OpSize32;
	  }
	  let Defs = [RAX,ESI] in {
	    def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
	                   "lodsq\t{$src, %rax\|rax, $src}", [], IIC_LODS>;
	  }
	} // SchedRW, Uses

	// String port I/O. DX is an implicit use of every form.
	let SchedRW = [WriteSystem] in {
	  // OUTS: these use the DF flag in the EFLAGS register to inc or dec ESI.
	  let Uses = [DX,ESI,EFLAGS], Defs = [ESI] in {
	    def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
	                  "outsb\t{$src, %dx\|dx, $src}", [], IIC_OUTS>;
	    def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
	                  "outsw\t{$src, %dx\|dx, $src}", [], IIC_OUTS>,
	                OpSize16;
	    def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
	                  "outs{l\|d}\t{$src, %dx\|dx, $src}", [], IIC_OUTS>,
	                OpSize32;
	  } // Uses, Defs

	  // INS: these use the DF flag in the EFLAGS register to inc or dec EDI.
	  let Uses = [DX,EDI,EFLAGS], Defs = [EDI] in {
	    def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst),
	                 "insb\t{%dx, $dst\|$dst, dx}", [], IIC_INS>;
	    def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst),
	                 "insw\t{%dx, $dst\|$dst, dx}", [], IIC_INS>,
	               OpSize16;
	    def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst),
	                 "ins{l\|d}\t{%dx, $dst\|$dst, dx}", [], IIC_INS>,
	               OpSize32;
	  } // Uses, Defs
	} // SchedRW = [WriteSystem]

	// Flag instructions: operand-less, pattern-less single-byte opcodes, plus
	// clts, which lives in the two-byte (TB) opcode map.
	let SchedRW = [WriteALU] in {
	  def CLC  : I<0xF8, RawFrm, (outs), (ins), "clc",  [], IIC_CLC>;
	  def STC  : I<0xF9, RawFrm, (outs), (ins), "stc",  [], IIC_STC>;
	  def CLI  : I<0xFA, RawFrm, (outs), (ins), "cli",  [], IIC_CLI>;
	  def STI  : I<0xFB, RawFrm, (outs), (ins), "sti",  [], IIC_STI>;
	  def CLD  : I<0xFC, RawFrm, (outs), (ins), "cld",  [], IIC_CLD>;
	  def STD  : I<0xFD, RawFrm, (outs), (ins), "std",  [], IIC_STD>;
	  def CMC  : I<0xF5, RawFrm, (outs), (ins), "cmc",  [], IIC_CMC>;

	  def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
	} // SchedRW = [WriteALU]

	// Table lookup instruction: xlatb implicitly reads AL and EBX, writes AL,
	// and is marked as a load.
	let mayLoad = 1, hasSideEffects = 0, Defs = [AL], Uses = [AL,EBX] in {
	  def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
	             Sched<[WriteLoad]>;
	}

	// ASCII/decimal adjust instructions. All are pattern-less, free of
	// unmodelled side effects, and unavailable in 64-bit mode.
	let SchedRW = [WriteMicrocoded], hasSideEffects = 0 in {
	  // ASCII Adjust After Addition
	  let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS] in {
	    def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
	              Requires<[Not64BitMode]>;
	  }

	  // ASCII Adjust AX Before Division
	  let Uses = [AX], Defs = [AX,EFLAGS] in {
	    def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
	                     "aad\t$src", [], IIC_AAD>,
	                 Requires<[Not64BitMode]>;
	  }

	  // ASCII Adjust AX After Multiply
	  let Uses = [AL], Defs = [AX,EFLAGS] in {
	    def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
	                     "aam\t$src", [], IIC_AAM>,
	                 Requires<[Not64BitMode]>;
	  }

	  // ASCII Adjust AL After Subtraction
	  let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS] in {
	    def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
	              Requires<[Not64BitMode]>;
	  }

	  // Decimal Adjust AL after Addition
	  let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS] in {
	    def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
	              Requires<[Not64BitMode]>;
	  }

	  // Decimal Adjust AL after Subtraction
	  let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS] in {
	    def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
	              Requires<[Not64BitMode]>;
	  }
	} // SchedRW, hasSideEffects

	// bound and arpl: pattern-less definitions that are unavailable in 64-bit
	// mode.
	let SchedRW = [WriteSystem] in {
	  // Check Array Index Against Bounds
	  def BOUNDS16rm : I<0x62, MRMSrcMem,
	                     (outs GR16:$dst), (ins i16mem:$src),
	                     "bound\t{$src, $dst\|$dst, $src}", [], IIC_BOUND>,
	                   OpSize16, Requires<[Not64BitMode]>;
	  def BOUNDS32rm : I<0x62, MRMSrcMem,
	                     (outs GR32:$dst), (ins i32mem:$src),
	                     "bound\t{$src, $dst\|$dst, $src}", [], IIC_BOUND>,
	                   OpSize32, Requires<[Not64BitMode]>;

	  // Adjust RPL Field of Segment Selector
	  def ARPL16rr : I<0x63, MRMDestReg,
	                   (outs GR16:$dst), (ins GR16:$src),
	                   "arpl\t{$src, $dst\|$dst, $src}", [], IIC_ARPL_REG>,
	                 Requires<[Not64BitMode]>;
	  let mayStore = 1 in {
	    def ARPL16mr : I<0x63, MRMDestMem,
	                     (outs), (ins i16mem:$dst, GR16:$src),
	                     "arpl\t{$src, $dst\|$dst, $src}", [], IIC_ARPL_MEM>,
	                   Requires<[Not64BitMode]>;
	  }
	} // SchedRW = [WriteSystem]

	//===----------------------------------------------------------------------===//
	// MOVBE Instructions
	//
	// movbe is selected for a byte swap folded into a load (rm forms) or into a
	// store (mr forms). All forms require HasMOVBE.
	let Predicates = [HasMOVBE] in {
	  // Load forms: bswap of a loaded value.
	  let SchedRW = [WriteALULd] in {
	    def MOVBE16rm : I<0xF0, MRMSrcMem,
	                      (outs GR16:$dst), (ins i16mem:$src),
	                      "movbe{w}\t{$src, $dst\|$dst, $src}",
	                      [(set GR16:$dst, (bswap (loadi16 addr:$src)))],
	                      IIC_MOVBE>,
	                    OpSize16, T8PS;
	    def MOVBE32rm : I<0xF0, MRMSrcMem,
	                      (outs GR32:$dst), (ins i32mem:$src),
	                      "movbe{l}\t{$src, $dst\|$dst, $src}",
	                      [(set GR32:$dst, (bswap (loadi32 addr:$src)))],
	                      IIC_MOVBE>,
	                    OpSize32, T8PS;
	    def MOVBE64rm : RI<0xF0, MRMSrcMem,
	                       (outs GR64:$dst), (ins i64mem:$src),
	                       "movbe{q}\t{$src, $dst\|$dst, $src}",
	                       [(set GR64:$dst, (bswap (loadi64 addr:$src)))],
	                       IIC_MOVBE>,
	                    T8PS;
	  } // SchedRW = [WriteALULd]

	  // Store forms: store of a bswapped register.
	  let SchedRW = [WriteStore] in {
	    def MOVBE16mr : I<0xF1, MRMDestMem,
	                      (outs), (ins i16mem:$dst, GR16:$src),
	                      "movbe{w}\t{$src, $dst\|$dst, $src}",
	                      [(store (bswap GR16:$src), addr:$dst)],
	                      IIC_MOVBE>,
	                    OpSize16, T8PS;
	    def MOVBE32mr : I<0xF1, MRMDestMem,
	                      (outs), (ins i32mem:$dst, GR32:$src),
	                      "movbe{l}\t{$src, $dst\|$dst, $src}",
	                      [(store (bswap GR32:$src), addr:$dst)],
	                      IIC_MOVBE>,
	                    OpSize32, T8PS;
	    def MOVBE64mr : RI<0xF1, MRMDestMem,
	                       (outs), (ins i64mem:$dst, GR64:$src),
	                       "movbe{q}\t{$src, $dst\|$dst, $src}",
	                       [(store (bswap GR64:$src), addr:$dst)],
	                       IIC_MOVBE>,
	                    T8PS;
	  } // SchedRW = [WriteStore]
	} // Predicates = [HasMOVBE]

	//===----------------------------------------------------------------------===//
	// RDRAND Instruction
	//
	// rdrand: selected from X86rdrand, producing both the destination register
	// and EFLAGS. Requires HasRDRAND.
	let SchedRW = [WriteSystem], Defs = [EFLAGS], Predicates = [HasRDRAND] in {
	  def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
	                    "rdrand{w}\t$dst",
	                    [(set GR16:$dst, EFLAGS, (X86rdrand))],
	                    IIC_RDRAND>,
	                  OpSize16, PS;
	  def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
	                    "rdrand{l}\t$dst",
	                    [(set GR32:$dst, EFLAGS, (X86rdrand))],
	                    IIC_RDRAND>,
	                  OpSize32, PS;
	  def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
	                     "rdrand{q}\t$dst",
	                     [(set GR64:$dst, EFLAGS, (X86rdrand))],
	                     IIC_RDRAND>,
	                  PS;
	} // SchedRW, Defs, Predicates

	//===----------------------------------------------------------------------===//
	// RDSEED Instruction
	//
	// rdseed: selected from X86rdseed, producing both the destination register
	// and EFLAGS. Requires HasRDSEED.
	let SchedRW = [WriteSystem], Defs = [EFLAGS], Predicates = [HasRDSEED] in {
	  def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
	                    "rdseed{w}\t$dst",
	                    [(set GR16:$dst, EFLAGS, (X86rdseed))],
	                    IIC_RDSEED>,
	                  OpSize16, PS;
	  def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
	                    "rdseed{l}\t$dst",
	                    [(set GR32:$dst, EFLAGS, (X86rdseed))],
	                    IIC_RDSEED>,
	                  OpSize32, PS;
	  def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
	                     "rdseed{q}\t$dst",
	                     [(set GR64:$dst, EFLAGS, (X86rdseed))],
	                     IIC_RDSEED>,
	                  PS;
	} // SchedRW, Defs, Predicates

	//===----------------------------------------------------------------------===//
	// LZCNT Instruction
	//
	// lzcnt: selected from ctlz on a register (rr) or on a folded load (rm).
	// Every form implicitly defines EFLAGS and requires HasLZCNT.
	let Defs = [EFLAGS], Predicates = [HasLZCNT] in {
	  // 16-bit
	  def LZCNT16rr : I<0xBD, MRMSrcReg,
	                    (outs GR16:$dst), (ins GR16:$src),
	                    "lzcnt{w}\t{$src, $dst\|$dst, $src}",
	                    [(set GR16:$dst, (ctlz GR16:$src)),
	                     (implicit EFLAGS)],
	                    IIC_LZCNT_RR>,
	                  XS, OpSize16, Sched<[WriteIMul]>;
	  def LZCNT16rm : I<0xBD, MRMSrcMem,
	                    (outs GR16:$dst), (ins i16mem:$src),
	                    "lzcnt{w}\t{$src, $dst\|$dst, $src}",
	                    [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
	                     (implicit EFLAGS)],
	                    IIC_LZCNT_RM>,
	                  XS, OpSize16, Sched<[WriteIMulLd]>;

	  // 32-bit
	  def LZCNT32rr : I<0xBD, MRMSrcReg,
	                    (outs GR32:$dst), (ins GR32:$src),
	                    "lzcnt{l}\t{$src, $dst\|$dst, $src}",
	                    [(set GR32:$dst, (ctlz GR32:$src)),
	                     (implicit EFLAGS)],
	                    IIC_LZCNT_RR>,
	                  XS, OpSize32, Sched<[WriteIMul]>;
	  def LZCNT32rm : I<0xBD, MRMSrcMem,
	                    (outs GR32:$dst), (ins i32mem:$src),
	                    "lzcnt{l}\t{$src, $dst\|$dst, $src}",
	                    [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
	                     (implicit EFLAGS)],
	                    IIC_LZCNT_RM>,
	                  XS, OpSize32, Sched<[WriteIMulLd]>;

	  // 64-bit
	  def LZCNT64rr : RI<0xBD, MRMSrcReg,
	                     (outs GR64:$dst), (ins GR64:$src),
	                     "lzcnt{q}\t{$src, $dst\|$dst, $src}",
	                     [(set GR64:$dst, (ctlz GR64:$src)),
	                      (implicit EFLAGS)],
	                     IIC_LZCNT_RR>,
	                  XS, Sched<[WriteIMul]>;
	  def LZCNT64rm : RI<0xBD, MRMSrcMem,
	                     (outs GR64:$dst), (ins i64mem:$src),
	                     "lzcnt{q}\t{$src, $dst\|$dst, $src}",
	                     [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
	                      (implicit EFLAGS)],
	                     IIC_LZCNT_RM>,
	                  XS, Sched<[WriteIMulLd]>;
	} // Defs, Predicates

	//===----------------------------------------------------------------------===//
	// BMI Instructions
	//
	// tzcnt: selected from cttz on a register (rr) or on a folded load (rm).
	// Every form implicitly defines EFLAGS and requires HasBMI.
	let Defs = [EFLAGS], Predicates = [HasBMI] in {
	  // 16-bit
	  def TZCNT16rr : I<0xBC, MRMSrcReg,
	                    (outs GR16:$dst), (ins GR16:$src),
	                    "tzcnt{w}\t{$src, $dst\|$dst, $src}",
	                    [(set GR16:$dst, (cttz GR16:$src)),
	                     (implicit EFLAGS)],
	                    IIC_TZCNT_RR>,
	                  XS, OpSize16, Sched<[WriteIMul]>;
	  def TZCNT16rm : I<0xBC, MRMSrcMem,
	                    (outs GR16:$dst), (ins i16mem:$src),
	                    "tzcnt{w}\t{$src, $dst\|$dst, $src}",
	                    [(set GR16:$dst, (cttz (loadi16 addr:$src))),
	                     (implicit EFLAGS)],
	                    IIC_TZCNT_RM>,
	                  XS, OpSize16, Sched<[WriteIMulLd]>;

	  // 32-bit
	  def TZCNT32rr : I<0xBC, MRMSrcReg,
	                    (outs GR32:$dst), (ins GR32:$src),
	                    "tzcnt{l}\t{$src, $dst\|$dst, $src}",
	                    [(set GR32:$dst, (cttz GR32:$src)),
	                     (implicit EFLAGS)],
	                    IIC_TZCNT_RR>,
	                  XS, OpSize32, Sched<[WriteIMul]>;
	  def TZCNT32rm : I<0xBC, MRMSrcMem,
	                    (outs GR32:$dst), (ins i32mem:$src),
	                    "tzcnt{l}\t{$src, $dst\|$dst, $src}",
	                    [(set GR32:$dst, (cttz (loadi32 addr:$src))),
	                     (implicit EFLAGS)],
	                    IIC_TZCNT_RM>,
	                  XS, OpSize32, Sched<[WriteIMulLd]>;

	  // 64-bit
	  def TZCNT64rr : RI<0xBC, MRMSrcReg,
	                     (outs GR64:$dst), (ins GR64:$src),
	                     "tzcnt{q}\t{$src, $dst\|$dst, $src}",
	                     [(set GR64:$dst, (cttz GR64:$src)),
	                      (implicit EFLAGS)],
	                     IIC_TZCNT_RR>,
	                  XS, Sched<[WriteIMul]>;
	  def TZCNT64rm : RI<0xBC, MRMSrcMem,
	                     (outs GR64:$dst), (ins i64mem:$src),
	                     "tzcnt{q}\t{$src, $dst\|$dst, $src}",
	                     [(set GR64:$dst, (cttz (loadi64 addr:$src))),
	                      (implicit EFLAGS)],
	                     IIC_TZCNT_RM>,
	                  XS, Sched<[WriteIMulLd]>;
	} // Defs, Predicates

	// Pattern-less register (rr) and memory (rm) forms of a one-source BMI
	// instruction at opcode 0xF3 in the VEX_4V T8PS encoding space.
	//
	// Template arguments:
	//   mnemonic - assembly mnemonic, including any size suffix.
	//   RegMRM   - instruction Format used by the register form.
	//   MemMRM   - instruction Format used by the memory form.
	//   RC       - register class of the destination and register source.
	//   x86memop - memory operand type of the memory form.
	multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
	                   RegisterClass RC, X86MemOperand x86memop> {
	  let hasSideEffects = 0 in {
	    def rr : I<0xF3, RegMRM,
	               (outs RC:$dst), (ins RC:$src),
	               mnemonic # "\t{$src, $dst\|$dst, $src}",
	               [], IIC_UNARY_REG>,
	             T8PS, VEX_4V, Sched<[WriteALU]>;

	    let mayLoad = 1 in {
	      def rm : I<0xF3, MemMRM,
	                 (outs RC:$dst), (ins x86memop:$src),
	                 mnemonic # "\t{$src, $dst\|$dst, $src}",
	                 [], IIC_UNARY_MEM>,
	               T8PS, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
	    }
	  } // hasSideEffects
	}

	// BLSR / BLSMSK / BLSI, instantiated from bmi_bls with the MRM1, MRM2 and
	// MRM3 formats respectively. The 64-bit variants add VEX_W. All require
	// HasBMI and implicitly define EFLAGS.
	let Defs = [EFLAGS], Predicates = [HasBMI] in {
	  defm BLSR32   : bmi_bls<"blsr{l}",   MRM1r, MRM1m, GR32, i32mem>;
	  defm BLSR64   : bmi_bls<"blsr{q}",   MRM1r, MRM1m, GR64, i64mem>, VEX_W;
	  defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>;
	  defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W;
	  defm BLSI32   : bmi_bls<"blsi{l}",   MRM3r, MRM3m, GR32, i32mem>;
	  defm BLSI64   : bmi_bls<"blsi{q}",   MRM3r, MRM3m, GR64, i64mem>, VEX_W;
	} // Defs, Predicates

	//===----------------------------------------------------------------------===//
	// Pattern fragments to auto generate BMI instructions.
	//===----------------------------------------------------------------------===//

	// Select the register forms of BLSR/BLSMSK/BLSI from their bit-twiddling
	// idioms when BMI is available.
	let Predicates = [HasBMI] in {
	  // FIXME: patterns for the load versions are not implemented

	  // x & (x - 1)  ->  BLSR
	  def : Pat<(and GR32:$src, (add GR32:$src, -1)), (BLSR32rr GR32:$src)>;
	  def : Pat<(and GR64:$src, (add GR64:$src, -1)), (BLSR64rr GR64:$src)>;

	  // x ^ (x - 1)  ->  BLSMSK
	  def : Pat<(xor GR32:$src, (add GR32:$src, -1)), (BLSMSK32rr GR32:$src)>;
	  def : Pat<(xor GR64:$src, (add GR64:$src, -1)), (BLSMSK64rr GR64:$src)>;

	  // x & -x  ->  BLSI
	  def : Pat<(and GR32:$src, (ineg GR32:$src)), (BLSI32rr GR32:$src)>;
	  def : Pat<(and GR64:$src, (ineg GR64:$src)), (BLSI64rr GR64:$src)>;
	} // Predicates = [HasBMI]

	// Register (rr) and memory (rm) forms of a two-source BMI instruction whose
	// selection pattern is an intrinsic call; both also implicitly define
	// EFLAGS in their patterns.
	//
	// Template arguments:
	//   opc      - opcode byte.
	//   mnemonic - assembly mnemonic, including any size suffix.
	//   RC       - register class of the result and register sources.
	//   x86memop - memory operand type of the first source in the rm form.
	//   Int      - intrinsic matched by the pattern.
	//   ld_frag  - load fragment wrapped around the address in the rm form.
	multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
	                          X86MemOperand x86memop, Intrinsic Int,
	                          PatFrag ld_frag> {
	  def rr : I<opc, MRMSrcReg4VOp3,
	             (outs RC:$dst), (ins RC:$src1, RC:$src2),
	             mnemonic # "\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	             [(set RC:$dst, (Int RC:$src1, RC:$src2)),
	              (implicit EFLAGS)],
	             IIC_BIN_NONMEM>,
	           T8PS, VEX, Sched<[WriteALU]>;

	  def rm : I<opc, MRMSrcMem4VOp3,
	             (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
	             mnemonic # "\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	             [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
	              (implicit EFLAGS)],
	             IIC_BIN_MEM>,
	           T8PS, VEX, Sched<[WriteALULd, ReadAfterLd]>;
	}

	// BEXTR (opcode 0xF7), selected from the int_x86_bmi_bextr_* intrinsics.
	// Requires HasBMI; the 64-bit variant adds VEX_W.
	let Defs = [EFLAGS], Predicates = [HasBMI] in {
	  defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}",
	                                GR32, i32mem,
	                                int_x86_bmi_bextr_32, loadi32>;
	  defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}",
	                                GR64, i64mem,
	                                int_x86_bmi_bextr_64, loadi64>,
	                 VEX_W;
	} // Defs, Predicates

	// BZHI (opcode 0xF5), selected from the int_x86_bmi_bzhi_* intrinsics.
	// Requires HasBMI2; the 64-bit variant adds VEX_W.
	let Defs = [EFLAGS], Predicates = [HasBMI2] in {
	  defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}",
	                               GR32, i32mem,
	                               int_x86_bmi_bzhi_32, loadi32>;
	  defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}",
	                               GR64, i64mem,
	                               int_x86_bmi_bzhi_64, loadi64>,
	                VEX_W;
	} // Defs, Predicates

	// Immediate transform: replaces the immediate with an i8 constant holding
	// the number of trailing one bits of its zero-extended value.
	def CountTrailingOnes : SDNodeXForm<imm, [{
	  unsigned NumOnes = countTrailingOnes(N->getZExtValue());
	  return getI8Imm(NumOnes, SDLoc(N));
	}]>;

	// Immediate transform: turns a mask constant into a 32-bit immediate whose
	// value is the mask's trailing-ones count shifted left by 8 (low byte zero).
	def BEXTRMaskXForm : SDNodeXForm<imm, [{
	  unsigned NumMaskBits = countTrailingOnes(N->getZExtValue());
	  unsigned Control = NumMaskBits << 8;
	  return getI32Imm(Control, SDLoc(N));
	}]>;

	// Matches an i64 immediate that satisfies isMask_64 and is larger than
	// UINT32_MAX.
	def AndMask64 : ImmLeaf<i64, [{
	  if (!isMask_64(Imm))
	    return false;
	  return Imm > UINT32_MAX;
	}]>;

	// Use BEXTR for 64-bit 'and' with large immediate 'mask'.
	// Only when BMI is available but neither BMI2 nor TBM is; those cases have
	// their own patterns for the same 'and'. The second BEXTR operand is the
	// BEXTRMaskXForm value (trailing-ones count << 8) materialized by MOV32ri and
	// placed in a 64-bit register through SUBREG_TO_REG.
	let Predicates = [HasBMI, NoBMI2, NoTBM] in {
	// Register source.
	def : Pat<(and GR64:$src, AndMask64:$mask),
	(BEXTR64rr GR64:$src,
	(SUBREG_TO_REG (i64 0),
	(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
	// Loaded source, folded into the memory form.
	def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
	(BEXTR64rm addr:$src,
	(SUBREG_TO_REG (i64 0),
	(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
	}

	// Use BZHI for 64-bit 'and' with large immediate 'mask'.
	// Requires BMI2 without TBM. The index operand is the trailing-ones count of
	// the mask (CountTrailingOnes), loaded with MOV8ri and inserted as the 8-bit
	// subregister of an otherwise undefined 64-bit register.
	let Predicates = [HasBMI2, NoTBM] in {
	// Register source.
	def : Pat<(and GR64:$src, AndMask64:$mask),
	(BZHI64rr GR64:$src,
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
	(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
	// Loaded source, folded into the memory form.
	def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
	(BZHI64rm addr:$src,
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
	(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
	}

	// Select BZHI for several expressions that keep the low $lz bits of a value
	// when $lz is a run-time quantity. Each expression has a register-source and
	// a load-folded variant. Where the index register is narrower than the
	// instruction's operand, it is inserted into an undefined wider register.
	let Predicates = [HasBMI2] in {
	// x & ((1 << y) - 1), 32-bit, y in an 8-bit register
	def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
	(BZHI32rr GR32:$src,
	(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;

	def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
	(BZHI32rm addr:$src,
	(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;

	// x & ((1 << y) - 1), 64-bit, y in an 8-bit register
	def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
	(BZHI64rr GR64:$src,
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;

	def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
	(BZHI64rm addr:$src,
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;

	// x & (-1 >> (32 - y))
	def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
	(BZHI32rr GR32:$src, GR32:$lz)>;
	def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
	(BZHI32rm addr:$src, GR32:$lz)>;

	// x & (-1 >> (64 - y))
	def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
	(BZHI64rr GR64:$src,
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
	def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
	(BZHI64rm addr:$src,
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;

	// x << (32 - y) >> (32 - y)
	def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
	(i8 (trunc (sub 32, GR32:$lz)))),
	(BZHI32rr GR32:$src, GR32:$lz)>;
	def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
	(i8 (trunc (sub 32, GR32:$lz)))),
	(BZHI32rm addr:$src, GR32:$lz)>;

	// x << (64 - y) >> (64 - y)
	def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
	(i8 (trunc (sub 64, GR32:$lz)))),
	(BZHI64rr GR64:$src,
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
	def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
	(i8 (trunc (sub 64, GR32:$lz)))),
	(BZHI64rm addr:$src,
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
	} // HasBMI2

	// Shared shape of PDEP and PEXT: opcode 0xF5, VEX_4V, first source always a
	// register and second source either a register (rr) or a loaded memory
	// operand (rm). Both forms are selected from the intrinsic `Int`.
	//   mnemonic - assembly mnemonic placed in front of the operand string
	//   RC       - register class of the destination and register sources
	//   x86memop - memory operand type used by the rm form
	//   Int      - intrinsic matched by both forms
	//   ld_frag  - load fragment wrapped around the address in the rm form
	multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
	                         X86MemOperand x86memop, Intrinsic Int,
	                         PatFrag ld_frag> {
	  // Register, register.
	  def rr : I<0xF5, MRMSrcReg,
	             (outs RC:$dst),
	             (ins RC:$src1, RC:$src2),
	             mnemonic # "\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	             [(set RC:$dst, (Int RC:$src1, RC:$src2))],
	             IIC_BIN_NONMEM>,
	           VEX_4V, Sched<[WriteALU]>;

	  // Register, memory: $src2 is loaded through ld_frag.
	  def rm : I<0xF5, MRMSrcMem,
	             (outs RC:$dst),
	             (ins RC:$src1, x86memop:$src2),
	             mnemonic # "\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))],
	             IIC_BIN_MEM>,
	           VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
	}

	// PDEP and PEXT instantiations (require HasBMI2). The two instructions use
	// the same multiclass and differ in the prefix class (T8XD for PDEP, T8XS for
	// PEXT); the 64-bit variants add VEX_W.
	let Predicates = [HasBMI2] in {
	defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
	int_x86_bmi_pdep_32, loadi32>, T8XD;
	defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
	int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
	defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
	int_x86_bmi_pext_32, loadi32>, T8XS;
	defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
	int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
	}

	//===----------------------------------------------------------------------===//
	// TBM Instructions
	//
	// Everything in this scope requires HasTBM and lists EFLAGS as defined.
	let Predicates = [HasTBM], Defs = [EFLAGS] in {

	// Source-plus-immediate form used by BEXTRI: `ri` takes a register source,
	// `mi` a loaded memory source; both are selected from the intrinsic `Int`
	// with the control immediate matched by `immoperator`.
	//   opc         - opcode byte
	//   RC          - register class of destination and register source
	//   OpcodeStr   - assembly mnemonic
	//   x86memop    - memory operand type for the mi form
	//   ld_frag     - load fragment wrapped around the address in the mi form
	//   Int         - intrinsic matched by both forms
	//   immtype     - operand type of the $cntl immediate
	//   immoperator - pattern operator used to match $cntl
	multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
	X86MemOperand x86memop, PatFrag ld_frag,
	Intrinsic Int, Operand immtype,
	SDPatternOperator immoperator> {
	def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
	!strconcat(OpcodeStr,
	"\t{$cntl, $src1, $dst\|$dst, $src1, $cntl}"),
	[(set RC:$dst, (Int RC:$src1, immoperator:$cntl))],
	IIC_BIN_NONMEM>, XOP, XOPA, Sched<[WriteALU]>;
	def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
	(ins x86memop:$src1, immtype:$cntl),
	!strconcat(OpcodeStr,
	"\t{$cntl, $src1, $dst\|$dst, $src1, $cntl}"),
	[(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))],
	IIC_BIN_MEM>, XOP, XOPA, Sched<[WriteALULd, ReadAfterLd]>;
	}

	// BEXTRI: 32-bit form with a plain 32-bit immediate; 64-bit form with a
	// sign-extended 32-bit immediate (ImmT = Imm32S) and VEX_W.
	defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
	int_x86_tbm_bextri_u32, i32imm, imm>;
	let ImmT = Imm32S in
	defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
	int_x86_tbm_bextri_u64, i64i32imm,
	i64immSExt32>, VEX_W;

	// One-source TBM instruction in register (rr) and memory (rm) form. No
	// selection pattern is attached here (the pattern list is empty), so
	// hasSideEffects = 0 and, for rm, mayLoad = 1 are stated explicitly.
	// FormReg/FormMem select the ModRM encoding form for each variant.
	multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
	RegisterClass RC, string OpcodeStr,
	X86MemOperand x86memop, PatFrag ld_frag> {
	let hasSideEffects = 0 in {
	def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
	!strconcat(OpcodeStr,"\t{$src, $dst\|$dst, $src}"),
	[], IIC_BIN_NONMEM>, XOP_4V, XOP9, Sched<[WriteALU]>;
	let mayLoad = 1 in
	def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
	!strconcat(OpcodeStr,"\t{$src, $dst\|$dst, $src}"),
	[], IIC_BIN_MEM>, XOP_4V, XOP9, Sched<[WriteALULd, ReadAfterLd]>;
	}
	}

	// Expands one TBM mnemonic into its 32-bit (NAME#32) and 64-bit (NAME#64,
	// with VEX_W) families via tbm_binary_rm.
	multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
	Format FormReg, Format FormMem> {
	defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
	loadi32>;
	defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
	loadi64>, VEX_W;
	}

	// The individual instructions; each is identified by its opcode byte
	// together with the MRM<n> form.
	defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
	defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
	defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
	defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
	defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
	defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
	defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
	defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
	defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
	} // HasTBM, EFLAGS

	// Use BEXTRI for 64-bit 'and' with large immediate 'mask'.
	// With TBM the BEXTRMaskXForm value (trailing-ones count << 8) is encoded
	// directly as the instruction's immediate, so no extra register is needed.
	let Predicates = [HasTBM] in {
	// Register source.
	def : Pat<(and GR64:$src, AndMask64:$mask),
	(BEXTRI64ri GR64:$src, (BEXTRMaskXForm imm:$mask))>;

	// Loaded source, folded into the memory form.
	def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
	(BEXTRI64mi addr:$src, (BEXTRMaskXForm imm:$mask))>;
	}

	//===----------------------------------------------------------------------===//
	// Lightweight Profiling Instructions

	// All definitions in this scope require HasLWP and are scheduled as
	// WriteSystem.
	let Predicates = [HasLWP], SchedRW = [WriteSystem] in {

	// LLWPCB takes one register operand and produces nothing; SLWPCB produces
	// one register and takes nothing. Both are selected from their intrinsics.
	def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src",
	[(int_x86_llwpcb GR32:$src)], IIC_LWP>,
	XOP, XOP9;
	def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst",
	[(set GR32:$dst, (int_x86_slwpcb))], IIC_LWP>,
	XOP, XOP9;

	// 64-bit register variants of the two instructions above (VEX_W).
	def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src",
	[(int_x86_llwpcb GR64:$src)], IIC_LWP>,
	XOP, XOP9, VEX_W;
	def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
	[(set GR64:$dst, (int_x86_slwpcb))], IIC_LWP>,
	XOP, XOP9, VEX_W;

	// LWPINS: first operand in register class RC, second operand a 32-bit
	// register (rri) or 32-bit memory load (rmi), plus a 32-bit immediate.
	// The selected node X86lwpins produces its result in EFLAGS.
	multiclass lwpins_intr<RegisterClass RC> {
	def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
	"lwpins\t{$cntl, $src1, $src0\|$src0, $src1, $cntl}",
	[(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))], IIC_LWP>,
	XOP_4V, XOPA;
	let mayLoad = 1 in
	def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
	"lwpins\t{$cntl, $src1, $src0\|$src0, $src1, $cntl}",
	[(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))], IIC_LWP>,
	XOP_4V, XOPA;
	}

	let Defs = [EFLAGS] in {
	defm LWPINS32 : lwpins_intr<GR32>;
	defm LWPINS64 : lwpins_intr<GR64>, VEX_W;
	} // EFLAGS

	// LWPVAL: same operand layout as LWPINS but selected from the intrinsic
	// `Int`, with no result and no EFLAGS definition.
	multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
	def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
	"lwpval\t{$cntl, $src1, $src0\|$src0, $src1, $cntl}",
	[(Int RC:$src0, GR32:$src1, imm:$cntl)], IIC_LWP>,
	XOP_4V, XOPA;
	let mayLoad = 1 in
	def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
	"lwpval\t{$cntl, $src1, $src0\|$src0, $src1, $cntl}",
	[(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)], IIC_LWP>,
	XOP_4V, XOPA;
	}

	defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>;
	defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W;

	} // HasLWP, SchedRW

	//===----------------------------------------------------------------------===//
	// MONITORX/MWAITX Instructions
	//
	let SchedRW = [ WriteSystem ] in {
	// Pseudo selected from int_x86_monitorx with explicit operands; it is
	// expanded by a custom inserter (usesCustomInserter = 1).
	let usesCustomInserter = 1 in {
	def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
	[(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>,
	Requires<[ HasMWAITX ]>;
	}

	// The real encoding has no explicit operands; it implicitly reads
	// EAX, ECX and EDX and has no selection pattern of its own.
	let Uses = [ EAX, ECX, EDX ] in {
	def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], IIC_SSE_MONITORX>,
	TB, Requires<[ HasMWAITX ]>;
	}

	// MWAITX is selected directly from its intrinsic with the fixed registers
	// ECX, EAX and EBX, which it implicitly reads.
	let Uses = [ ECX, EAX, EBX ] in {
	def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
	[(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>,
	TB, Requires<[ HasMWAITX ]>;
	}
	} // SchedRW

	// Accept mwaitx written with its implicit registers spelled out: the 32-bit
	// register names outside 64-bit mode, the 64-bit names in 64-bit mode.
	def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx\|ebx, ecx, eax}", (MWAITXrrr)>,
	Requires<[ Not64BitMode ]>;
	def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx\|rbx, rcx, rax}", (MWAITXrrr)>,
	Requires<[ In64BitMode ]>;

	// Same for monitorx.
	def : InstAlias<"monitorx\t{%eax, %ecx, %edx\|edx, ecx, eax}", (MONITORXrrr)>,
	Requires<[ Not64BitMode ]>;
	def : InstAlias<"monitorx\t{%rax, %rcx, %rdx\|rdx, rcx, rax}", (MONITORXrrr)>,
	Requires<[ In64BitMode ]>;

	//===----------------------------------------------------------------------===//
	// CLZERO Instruction
	//
	let SchedRW = [WriteSystem] in {
	// Real encoding: no explicit operands, implicitly reads EAX, no selection
	// pattern.
	let Uses = [EAX] in
	def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", [], IIC_SSE_CLZERO>,
	TB, Requires<[HasCLZERO]>;

	// Pseudo selected from int_x86_clzero with an explicit address operand;
	// expanded by a custom inserter (usesCustomInserter = 1).
	let usesCustomInserter = 1 in {
	def CLZERO : PseudoI<(outs), (ins i32mem:$src1),
	[(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>;
	}
	} // SchedRW

	// Accept clzero with the implicit register written out, using the register
	// name that matches the current mode.
	def : InstAlias<"clzero\t{%eax\|eax}", (CLZEROr)>, Requires<[Not64BitMode]>;
	def : InstAlias<"clzero\t{%rax\|rax}", (CLZEROr)>, Requires<[In64BitMode]>;

	//===----------------------------------------------------------------------===//
	// Pattern fragments to auto generate TBM instructions.
	//===----------------------------------------------------------------------===//

	// Register-only selection patterns that fold a short and/or/xor expression
	// on a single value into one TBM instruction. Active only under HasTBM.
	let Predicates = [HasTBM] in {
	// FIXME: patterns for the load versions are not implemented
	// x & (x + 1) -> BLCFILL
	def : Pat<(and GR32:$src, (add GR32:$src, 1)),
	(BLCFILL32rr GR32:$src)>;
	def : Pat<(and GR64:$src, (add GR64:$src, 1)),
	(BLCFILL64rr GR64:$src)>;

	// x | ~(x + 1) -> BLCI
	def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
	(BLCI32rr GR32:$src)>;
	def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
	(BLCI64rr GR64:$src)>;

	// Extra patterns because opt can optimize the above patterns to this.
	// x | (-2 - x) -> BLCI
	def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
	(BLCI32rr GR32:$src)>;
	def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
	(BLCI64rr GR64:$src)>;

	// ~x & (x + 1) -> BLCIC
	def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
	(BLCIC32rr GR32:$src)>;
	def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
	(BLCIC64rr GR64:$src)>;

	// x ^ (x + 1) -> BLCMSK
	def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
	(BLCMSK32rr GR32:$src)>;
	def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
	(BLCMSK64rr GR64:$src)>;

	// x | (x + 1) -> BLCS
	def : Pat<(or GR32:$src, (add GR32:$src, 1)),
	(BLCS32rr GR32:$src)>;
	def : Pat<(or GR64:$src, (add GR64:$src, 1)),
	(BLCS64rr GR64:$src)>;

	// x | (x - 1) -> BLSFILL
	def : Pat<(or GR32:$src, (add GR32:$src, -1)),
	(BLSFILL32rr GR32:$src)>;
	def : Pat<(or GR64:$src, (add GR64:$src, -1)),
	(BLSFILL64rr GR64:$src)>;

	// ~x | (x - 1) -> BLSIC
	def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
	(BLSIC32rr GR32:$src)>;
	def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
	(BLSIC64rr GR64:$src)>;

	// ~x | (x + 1) -> T1MSKC
	def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
	(T1MSKC32rr GR32:$src)>;
	def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
	(T1MSKC64rr GR64:$src)>;

	// ~x & (x - 1) -> TZMSK
	def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
	(TZMSK32rr GR32:$src)>;
	def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
	(TZMSK64rr GR64:$src)>;
	} // HasTBM

	//===----------------------------------------------------------------------===//
	// Memory Instructions
	//

	// CLFLUSHOPT: one memory operand, selected from int_x86_clflushopt.
	// Encoded as 0xAE /7 with the PD prefix class; gated on HasCLFLUSHOPT.
	let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in
	def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
	"clflushopt\t$src", [(int_x86_clflushopt addr:$src)],
	IIC_SSE_PREFETCH>, PD;

	// CLWB: one memory operand, selected from int_x86_clwb.
	// Encoded as 0xAE /6 with the PD prefix class; gated on HasCLWB.
	let Predicates = [HasCLWB], SchedRW = [WriteLoad] in
	def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
	[(int_x86_clwb addr:$src)], IIC_SSE_PREFETCH>, PD;

	//===----------------------------------------------------------------------===//
	// Subsystems.
	//===----------------------------------------------------------------------===//

	include "X86InstrArithmetic.td"
	include "X86InstrCMovSetCC.td"
	include "X86InstrExtension.td"
	include "X86InstrControl.td"
	include "X86InstrShiftRotate.td"

	// X87 Floating Point Stack.
	include "X86InstrFPStack.td"

	// SIMD support (SSE, MMX and AVX)
	include "X86InstrFragmentsSIMD.td"

	// FMA - Fused Multiply-Add support (requires FMA)
	include "X86InstrFMA.td"

	// XOP
	include "X86InstrXOP.td"

	// SSE, MMX and 3DNow! vector support.
	include "X86InstrSSE.td"
	include "X86InstrAVX512.td"
	include "X86InstrMMX.td"
	include "X86Instr3DNow.td"

	// MPX instructions
	include "X86InstrMPX.td"

	include "X86InstrVMX.td"
	include "X86InstrSVM.td"

	include "X86InstrTSX.td"
	include "X86InstrSGX.td"

	// System instructions.
	include "X86InstrSystem.td"

	// Compiler Pseudo Instructions and Pat Patterns
	include "X86InstrCompiler.td"
	include "X86InstrVecCompiler.td"

	//===----------------------------------------------------------------------===//
	// Assembler Mnemonic Aliases
	//===----------------------------------------------------------------------===//

	// Unsuffixed "call" in the "att" variant takes the size suffix of the
	// current mode (w/l/q for 16/32/64-bit).
	def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;

	// Map cbw/cwde/cwd/cdq/cdqe/cqo onto cbtw/cwtl/cwtd/cltd/cltq/cqto in the
	// "att" variant.
	def : MnemonicAlias<"cbw", "cbtw", "att">;
	def : MnemonicAlias<"cwde", "cwtl", "att">;
	def : MnemonicAlias<"cwd", "cwtd", "att">;
	def : MnemonicAlias<"cdq", "cltd", "att">;
	def : MnemonicAlias<"cdqe", "cltq", "att">;
	def : MnemonicAlias<"cqo", "cqto", "att">;

	// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq.
	def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;

	// A size-suffixed "leave" is accepted only with the suffix that matches the
	// mode and maps to plain "leave".
	def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>;
	def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;

	// No variant given for the loop aliases, unlike the "att"-only ones above.
	def : MnemonicAlias<"loopz", "loope">;
	def : MnemonicAlias<"loopnz", "loopne">;

	// Unsuffixed pop/popf take the mode's size suffix. The 64-bit popf mapping
	// is also registered for the "intel" variant.
	def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"popf", "popfq", "intel">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"popfd", "popfl", "att">;

	// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
	// all modes. However: "push (addr)" and "push $42" should default to
	// pushl/pushq depending on the current mode. Similar for "pop %bx"
	// Unsuffixed push/pushf take the mode's size suffix, mirroring pop/popf.
	def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"pushf", "pushfq", "intel">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"pushfd", "pushfl", "att">;

	// "intel" variant: popad/pushad map to the 'l' forms outside 64-bit mode;
	// unsuffixed popa/pusha take the 'w' or 'l' form by mode (16/32-bit only).
	def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>;
	def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
	def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>;

	// The same popa/pusha mapping for the "att" variant.
	def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>;

	// Alternate spellings of the repeat prefixes (no variant restriction).
	def : MnemonicAlias<"repe", "rep">;
	def : MnemonicAlias<"repz", "rep">;
	def : MnemonicAlias<"repnz", "repne">;

	// Unsuffixed "ret" in the "att" variant takes the mode's size suffix.
	def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;

	// Apply 'ret' behavior to 'retn'
	def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"retn", "ret", "intel">;

	// sal is accepted as a spelling of shl: unsuffixed for "intel", with each
	// size suffix for "att".
	def : MnemonicAlias<"sal", "shl", "intel">;
	def : MnemonicAlias<"salb", "shlb", "att">;
	def : MnemonicAlias<"salw", "shlw", "att">;
	def : MnemonicAlias<"sall", "shll", "att">;
	def : MnemonicAlias<"salq", "shlq", "att">;

	// smov<size> is accepted as a spelling of movs<size> in the "att" variant.
	def : MnemonicAlias<"smovb", "movsb", "att">;
	def : MnemonicAlias<"smovw", "movsw", "att">;
	def : MnemonicAlias<"smovl", "movsl", "att">;
	def : MnemonicAlias<"smovq", "movsq", "att">;

	def : MnemonicAlias<"ud2a", "ud2", "att">;
	def : MnemonicAlias<"verrw", "verr", "att">;

	// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release'
	def : MnemonicAlias<"acquire", "xacquire", "intel">;
	def : MnemonicAlias<"release", "xrelease", "intel">;

	// System instruction aliases.
	// Unsuffixed iret: 'w' form in 16-bit mode, 'l' form otherwise. Unsuffixed
	// sysret/sysexit always map to the 'l' form.
	def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>;
	def : MnemonicAlias<"sysret", "sysretl", "att">;
	def : MnemonicAlias<"sysexit", "sysexitl", "att">;

	// Unsuffixed descriptor-table loads/stores (lgdt, lidt, sgdt, sidt) take the
	// size suffix of the current mode.
	def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
	def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
	def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
	def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;


	// Floating point stack aliases.
	// Alternate condition spellings for fcmov ("att" variant).
	def : MnemonicAlias<"fcmovz", "fcmove", "att">;
	def : MnemonicAlias<"fcmova", "fcmovnbe", "att">;
	def : MnemonicAlias<"fcmovnae", "fcmovb", "att">;
	def : MnemonicAlias<"fcmovna", "fcmovbe", "att">;
	def : MnemonicAlias<"fcmovae", "fcmovnb", "att">;
	def : MnemonicAlias<"fcomip", "fcompi">;
	// 'q'-suffixed integer load/store spellings map to the 'll' forms.
	def : MnemonicAlias<"fildq", "fildll", "att">;
	def : MnemonicAlias<"fistpq", "fistpll", "att">;
	def : MnemonicAlias<"fisttpq", "fisttpll", "att">;
	// A redundant trailing 'w' on the control/status word mnemonics is dropped.
	def : MnemonicAlias<"fldcww", "fldcw", "att">;
	def : MnemonicAlias<"fnstcww", "fnstcw", "att">;
	def : MnemonicAlias<"fnstsww", "fnstsw", "att">;
	def : MnemonicAlias<"fucomip", "fucompi">;
	def : MnemonicAlias<"fwait", "wait">;

	// 'q'-suffixed state save/restore spellings map to the '64' forms.
	def : MnemonicAlias<"fxsaveq", "fxsave64", "att">;
	def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">;
	def : MnemonicAlias<"xsaveq", "xsave64", "att">;
	def : MnemonicAlias<"xrstorq", "xrstor64", "att">;
	def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">;
	def : MnemonicAlias<"xrstorsq", "xrstors64", "att">;
	def : MnemonicAlias<"xsavecq", "xsavec64", "att">;
	def : MnemonicAlias<"xsavesq", "xsaves64", "att">;

	// A MnemonicAlias from Prefix+OldCond+Suffix to Prefix+NewCond+Suffix, i.e.
	// the same mnemonic with only its condition-code part replaced.
	// VariantName is passed through as the alias's variant.
	class CondCodeAlias<string Prefix, string Suffix,
	                    string OldCond, string NewCond,
	                    string VariantName>
	  : MnemonicAlias<Prefix # OldCond # Suffix,
	                  Prefix # NewCond # Suffix,
	                  VariantName>;

	/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
	/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
	/// example "setz" -> "sete".
	///   Prefix - text in front of the condition code (e.g. "set")
	///   Suffix - text after the condition code (e.g. a size suffix, or "")
	///   V      - variant name forwarded to each alias; defaults to ""
	/// The trailing comments show the result for Prefix = "set", Suffix = "".
	multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
	string V = ""> {
	def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb
	def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete
	def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe
	def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae
	def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae
	def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle
	def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge
	def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne
	def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp
	def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp

	// Three-letter negated spellings.
	def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb
	def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta
	def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl
	def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg
	}

	// Aliases for set<CC>
	// (no variant argument, so V takes its default of "")
	defm : IntegerCondCodeMnemonicAlias<"set", "">;
	// Aliases for j<CC>
	defm : IntegerCondCodeMnemonicAlias<"j", "">;
	// Aliases for cmov<CC>{w,l,q}
	defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
	defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
	defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
	// No size suffix for intel-style asm.
	defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;


	//===----------------------------------------------------------------------===//
	// Assembler Instruction Aliases
	//===----------------------------------------------------------------------===//

	// aad/aam default to base 10 if no operand is specified.
	// (Not available in 64-bit mode.)
	def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>;
	def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;

	// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
	// Likewise for btc/btr/bts.
	// NOTE(review): the trailing 0 on these aliases is presumably the InstAlias
	// "emit" flag (accept when parsing, do not use for printing) - confirm
	// against the InstAlias class definition.
	def : InstAlias<"bt\t{$imm, $mem\|$mem, $imm}",
	(BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
	def : InstAlias<"btc\t{$imm, $mem\|$mem, $imm}",
	(BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
	def : InstAlias<"btr\t{$imm, $mem\|$mem, $imm}",
	(BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
	def : InstAlias<"bts\t{$imm, $mem\|$mem, $imm}",
	(BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;

	// clr aliases.
	// "clr<size> reg" is an XOR of the register with itself.
	def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
	def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
	def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
	def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;

	// lods aliases. Accept the destination being omitted because it's implicit
	// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
	// in the destination.
	// Suffixed mnemonic, source operand only. The 64-bit form is limited to
	// 64-bit mode throughout this group.
	def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>;
	def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>;
	def : InstAlias<"lods{l\|d}\t$src", (LODSL srcidx32:$src), 0>;
	def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
	// Unsuffixed mnemonic with the accumulator written out; the register name
	// picks the size.
	def : InstAlias<"lods\t{$src, %al\|al, $src}", (LODSB srcidx8:$src), 0>;
	def : InstAlias<"lods\t{$src, %ax\|ax, $src}", (LODSW srcidx16:$src), 0>;
	def : InstAlias<"lods\t{$src, %eax\|eax, $src}", (LODSL srcidx32:$src), 0>;
	def : InstAlias<"lods\t{$src, %rax\|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
	// Unsuffixed mnemonic with only the source; one candidate per operand size,
	// told apart by the srcidx operand class.
	def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0>;
	def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0>;
	def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0>;
	def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;


	// stos aliases. Accept the source being omitted because it's implicit in
	// the mnemonic, or the mnemonic suffix being omitted because it's implicit
	// in the source.
	// Suffixed mnemonic, destination operand only. The 64-bit form is limited
	// to 64-bit mode throughout this group.
	def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>;
	def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>;
	def : InstAlias<"stos{l\|d}\t$dst", (STOSL dstidx32:$dst), 0>;
	def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
	// Unsuffixed mnemonic with the accumulator written out; the register name
	// picks the size.
	def : InstAlias<"stos\t{%al, $dst\|$dst, al}", (STOSB dstidx8:$dst), 0>;
	def : InstAlias<"stos\t{%ax, $dst\|$dst, ax}", (STOSW dstidx16:$dst), 0>;
	def : InstAlias<"stos\t{%eax, $dst\|$dst, eax}", (STOSL dstidx32:$dst), 0>;
	def : InstAlias<"stos\t{%rax, $dst\|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
	// Unsuffixed mnemonic with only the destination; one candidate per operand
	// size, told apart by the dstidx operand class.
	def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0>;
	def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0>;
	def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0>;
	def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;


	// scas aliases. Accept the destination being omitted because it's implicit
	// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
	// in the destination.
	// Suffixed mnemonic, memory operand only. The 64-bit form is limited to
	// 64-bit mode throughout this group.
	def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>;
	def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>;
	def : InstAlias<"scas{l\|d}\t$dst", (SCASL dstidx32:$dst), 0>;
	def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
	// Unsuffixed mnemonic with the accumulator written out; the register name
	// picks the size.
	def : InstAlias<"scas\t{$dst, %al\|al, $dst}", (SCASB dstidx8:$dst), 0>;
	def : InstAlias<"scas\t{$dst, %ax\|ax, $dst}", (SCASW dstidx16:$dst), 0>;
	def : InstAlias<"scas\t{$dst, %eax\|eax, $dst}", (SCASL dstidx32:$dst), 0>;
	def : InstAlias<"scas\t{$dst, %rax\|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
	// Unsuffixed mnemonic with only the memory operand; one candidate per
	// operand size, told apart by the dstidx operand class.
	def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0>;
	def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0>;
	def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0>;
	def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;

	// cmps aliases. Mnemonic suffix being omitted because it's implicit
	// in the destination.
	// One candidate per operand size, told apart by the dstidx/srcidx operand
	// classes; the 64-bit form is limited to 64-bit mode.
	def : InstAlias<"cmps\t{$dst, $src\|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0>;
	def : InstAlias<"cmps\t{$dst, $src\|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0>;
	def : InstAlias<"cmps\t{$dst, $src\|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0>;
	def : InstAlias<"cmps\t{$dst, $src\|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;

	// movs aliases. Mnemonic suffix being omitted because it's implicit
	// in the destination.
	// Same scheme as cmps; note the operand order in the asm string differs.
	def : InstAlias<"movs\t{$src, $dst\|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0>;
	def : InstAlias<"movs\t{$src, $dst\|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0>;
	def : InstAlias<"movs\t{$src, $dst\|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0>;
	def : InstAlias<"movs\t{$src, $dst\|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;

	// div and idiv aliases for explicit A register.
	// The accumulator named in the asm string is not an operand of the result
	// instruction; only $src is passed through.
	// div, register source.
	def : InstAlias<"div{b}\t{$src, %al\|al, $src}", (DIV8r GR8 :$src)>;
	def : InstAlias<"div{w}\t{$src, %ax\|ax, $src}", (DIV16r GR16:$src)>;
	def : InstAlias<"div{l}\t{$src, %eax\|eax, $src}", (DIV32r GR32:$src)>;
	def : InstAlias<"div{q}\t{$src, %rax\|rax, $src}", (DIV64r GR64:$src)>;
	// div, memory source.
	def : InstAlias<"div{b}\t{$src, %al\|al, $src}", (DIV8m i8mem :$src)>;
	def : InstAlias<"div{w}\t{$src, %ax\|ax, $src}", (DIV16m i16mem:$src)>;
	def : InstAlias<"div{l}\t{$src, %eax\|eax, $src}", (DIV32m i32mem:$src)>;
	def : InstAlias<"div{q}\t{$src, %rax\|rax, $src}", (DIV64m i64mem:$src)>;
	// idiv, register source.
	def : InstAlias<"idiv{b}\t{$src, %al\|al, $src}", (IDIV8r GR8 :$src)>;
	def : InstAlias<"idiv{w}\t{$src, %ax\|ax, $src}", (IDIV16r GR16:$src)>;
	def : InstAlias<"idiv{l}\t{$src, %eax\|eax, $src}", (IDIV32r GR32:$src)>;
	def : InstAlias<"idiv{q}\t{$src, %rax\|rax, $src}", (IDIV64r GR64:$src)>;
	// idiv, memory source.
	def : InstAlias<"idiv{b}\t{$src, %al\|al, $src}", (IDIV8m i8mem :$src)>;
	def : InstAlias<"idiv{w}\t{$src, %ax\|ax, $src}", (IDIV16m i16mem:$src)>;
	def : InstAlias<"idiv{l}\t{$src, %eax\|eax, $src}", (IDIV32m i32mem:$src)>;
	def : InstAlias<"idiv{q}\t{$src, %rax\|rax, $src}", (IDIV64m i64mem:$src)>;



	// Various unary fpstack operations default to operating on ST1.
	// For example, "fxch" -> "fxch %st(1)"
	// Note that the operand-less "fadd" and "fmul" map to the same popping
	// instructions (ADD_FPrST0 / MUL_FPrST0) as "faddp" and "fmulp".
	def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
	def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>;
	def : InstAlias<"fsub{\|r}p", (SUBR_FPrST0 ST1), 0>;
	def : InstAlias<"fsub{r\|}p", (SUB_FPrST0 ST1), 0>;
	def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>;
	def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>;
	def : InstAlias<"fdiv{\|r}p", (DIVR_FPrST0 ST1), 0>;
	def : InstAlias<"fdiv{r\|}p", (DIV_FPrST0 ST1), 0>;
	def : InstAlias<"fxch", (XCH_F ST1), 0>;
	def : InstAlias<"fcom", (COM_FST0r ST1), 0>;
	def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>;
	def : InstAlias<"fcomi", (COM_FIr ST1), 0>;
	def : InstAlias<"fcompi", (COM_FIPr ST1), 0>;
	def : InstAlias<"fucom", (UCOM_Fr ST1), 0>;
	def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>;
	def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>;
	def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;

	// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
	// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate
	// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
	// gas.
	// Two aliases per mnemonic that accept st(0) written as an explicit operand:
	// one pairing st(0) with an arbitrary stack register $op, and one for the
	// st(0), st(0) spelling, which resolves to Inst applied to ST0.
	//   Mnemonic  - mnemonic placed in front of the operand string
	//   Inst      - instruction the alias resolves to
	//   EmitAlias - forwarded as the last InstAlias argument; defaults to 1
	multiclass FpUnaryAlias<string Mnemonic, Instruction Inst,
	                        bit EmitAlias = 1> {
	  // <mnemonic> $op, st(0)
	  def : InstAlias<Mnemonic # "\t{$op, %st(0)\|st(0), $op}",
	                  (Inst RST:$op),
	                  EmitAlias>;
	  // <mnemonic> st(0), st(0)
	  def : InstAlias<Mnemonic # "\t{%st(0), %st(0)\|st(0), st(0)}",
	                  (Inst ST0),
	                  EmitAlias>;
	}

	// Instantiate the explicit-st(0) aliases. faddp, fcomi and fucomi pass
	// EmitAlias = 0; all others use the default of 1.
	defm : FpUnaryAlias<"fadd", ADD_FST0r>;
	defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
	defm : FpUnaryAlias<"fsub", SUB_FST0r>;
	defm : FpUnaryAlias<"fsub{\|r}p", SUBR_FPrST0>;
	defm : FpUnaryAlias<"fsubr", SUBR_FST0r>;
	defm : FpUnaryAlias<"fsub{r\|}p", SUB_FPrST0>;
	defm : FpUnaryAlias<"fmul", MUL_FST0r>;
	defm : FpUnaryAlias<"fmulp", MUL_FPrST0>;
	defm : FpUnaryAlias<"fdiv", DIV_FST0r>;
	defm : FpUnaryAlias<"fdiv{\|r}p", DIVR_FPrST0>;
	defm : FpUnaryAlias<"fdivr", DIVR_FST0r>;
	defm : FpUnaryAlias<"fdiv{r\|}p", DIV_FPrST0>;
	defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
	defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
	defm : FpUnaryAlias<"fcompi", COM_FIPr>;
	defm : FpUnaryAlias<"fucompi", UCOM_FIPr>;


	// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
	// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
	// solely because gas supports it.
	// Only the faddp alias passes an explicit trailing 0; the others use the
	// InstAlias default for that argument.
	def : InstAlias<"faddp\t{%st(0), $op\|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
	def : InstAlias<"fmulp\t{%st(0), $op\|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
	def : InstAlias<"fsub{\|r}p\t{%st(0), $op\|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
	def : InstAlias<"fsub{r\|}p\t{%st(0), $op\|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
	def : InstAlias<"fdiv{\|r}p\t{%st(0), $op\|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
	def : InstAlias<"fdiv{r\|}p\t{%st(0), $op\|$op, st(0)}", (DIV_FPrST0 RST:$op)>;

	// We accept "fnstsw %eax" even though it only writes %ax.
	// The %al spelling and the operand-less spelling are accepted too; all three
	// are matched to FNSTSW16r.
	def : InstAlias<"fnstsw\t{%eax\|eax}", (FNSTSW16r)>;
	def : InstAlias<"fnstsw\t{%al\|al}" , (FNSTSW16r)>;
	def : InstAlias<"fnstsw" , (FNSTSW16r)>;

	// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
	// this is compatible with what GAS does.
	// Immediate "seg, off" forms exist for 32-bit and 16-bit mode only. The
	// memory-indirect forms use the 32-bit far instructions whenever the mode is
	// not 16-bit (which includes 64-bit mode) and the 16-bit ones otherwise.
	def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
	def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
	def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
	def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
	def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
	def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
	def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
	def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;

	// Suffix-less memory-indirect call/jmp: the memory operand width follows the
	// current mode (64-, 32- or 16-bit), one alias per mode.
	def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
	def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
	def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
	def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
	def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
	def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>;


	// "imul <imm>, B" is an alias for "imul <imm>, B, B".
	// Each width gets two aliases, one for the full-size immediate instruction
	// and one for the 8-bit-immediate (rri8) instruction; $r is supplied for both
	// register operands of the three-operand form.
	def : InstAlias<"imul{w}\t{$imm, $r\|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>;
	def : InstAlias<"imul{w}\t{$imm, $r\|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
	def : InstAlias<"imul{l}\t{$imm, $r\|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>;
	def : InstAlias<"imul{l}\t{$imm, $r\|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
	def : InstAlias<"imul{q}\t{$imm, $r\|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
	def : InstAlias<"imul{q}\t{$imm, $r\|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;

	// ins aliases. Accept the mnemonic suffix being omitted because it's implicit
	// in the destination.
	// The three aliases share one asm string; the operand class (dstidx8/16/32)
	// is what distinguishes INSB, INSW and INSL.
	def : InstAlias<"ins\t{%dx, $dst\|$dst, dx}", (INSB dstidx8:$dst), 0>;
	def : InstAlias<"ins\t{%dx, $dst\|$dst, dx}", (INSW dstidx16:$dst), 0>;
	def : InstAlias<"ins\t{%dx, $dst\|$dst, dx}", (INSL dstidx32:$dst), 0>;

	// outs aliases. Accept the mnemonic suffix being omitted because it's implicit
	// in the source.
	// As above, srcidx8/16/32 selects between OUTSB, OUTSW and OUTSL.
	def : InstAlias<"outs\t{$src, %dx\|dx, $src}", (OUTSB srcidx8:$src), 0>;
	def : InstAlias<"outs\t{$src, %dx\|dx, $src}", (OUTSW srcidx16:$src), 0>;
	def : InstAlias<"outs\t{$src, %dx\|dx, $src}", (OUTSL srcidx32:$src), 0>;

	// inb %dx -> inb %al, %dx
	// Register-port forms with the accumulator operand left out.
	def : InstAlias<"inb\t{%dx\|dx}", (IN8rr), 0>;
	def : InstAlias<"inw\t{%dx\|dx}", (IN16rr), 0>;
	def : InstAlias<"inl\t{%dx\|dx}", (IN32rr), 0>;
	// Immediate-port forms (u8imm port number), again without the accumulator.
	def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>;
	def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>;
	def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>;


	// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
	// The unsuffixed spellings pick the 16- or 32-bit far instruction from the
	// current mode; the explicitly suffixed callw/jmpw/calll/jmpl spellings are
	// accepted in any mode other than 64-bit.
	def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
	def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
	def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
	def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
	def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
	def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
	def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
	def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;

	// Force mov without a suffix with a segment and mem to prefer the 'l' form of
	// the move. All segment/mem forms are equivalent, this has the shortest
	// encoding.
	// NOTE(review): the text above says the 'l' form, but the aliases below
	// select MOV16sm/MOV16ms with i16mem operands. Confirm which form is intended
	// and bring the wording in line with the records.
	def : InstAlias<"mov\t{$mem, $seg\|$seg, $mem}", (MOV16sm SEGMENT_REG:$seg, i16mem:$mem), 0>;
	def : InstAlias<"mov\t{$seg, $mem\|$mem, $seg}", (MOV16ms i16mem:$mem, SEGMENT_REG:$seg), 0>;

	// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
	def : InstAlias<"mov{q}\t{$imm, $reg\|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;

	// Match 'movq GR64, MMX' as an alias for movd.
	// Both directions are covered: GR64 -> VR64 and VR64 -> GR64.
	def : InstAlias<"movq\t{$src, $dst\|$dst, $src}",
	(MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
	def : InstAlias<"movq\t{$src, $dst\|$dst, $src}",
	(MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;

	// movsx aliases
	// Suffix-less "movsx": every alias shares one asm string and the operand
	// register classes pick the instruction. The only memory-source form listed
	// is the 8-bit-memory to 16-bit-register one.
	def : InstAlias<"movsx\t{$src, $dst\|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
	def : InstAlias<"movsx\t{$src, $dst\|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
	def : InstAlias<"movsx\t{$src, $dst\|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
	def : InstAlias<"movsx\t{$src, $dst\|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
	def : InstAlias<"movsx\t{$src, $dst\|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
	def : InstAlias<"movsx\t{$src, $dst\|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
	def : InstAlias<"movsx\t{$src, $dst\|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;

	// movzx aliases
	// Same scheme as movsx above, minus the 32-bit to 64-bit form.
	def : InstAlias<"movzx\t{$src, $dst\|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
	def : InstAlias<"movzx\t{$src, $dst\|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
	def : InstAlias<"movzx\t{$src, $dst\|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
	def : InstAlias<"movzx\t{$src, $dst\|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
	def : InstAlias<"movzx\t{$src, $dst\|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>;
	def : InstAlias<"movzx\t{$src, $dst\|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>;
	// Note: No GR32->GR64 movzx form.

	// outb %dx -> outb %al, %dx
	// Register-port forms with the accumulator operand left out.
	def : InstAlias<"outb\t{%dx\|dx}", (OUT8rr), 0>;
	def : InstAlias<"outw\t{%dx\|dx}", (OUT16rr), 0>;
	def : InstAlias<"outl\t{%dx\|dx}", (OUT32rr), 0>;
	// Immediate-port forms (u8imm port number), again without the accumulator.
	def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>;
	def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>;
	def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>;

	// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
	// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
	// errors, since its encoding is the most compact.
	def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;

	// shld/shrd op,op -> shld op, op, CL
	// Register-destination forms: the two-operand spelling is matched to the
	// *rrCL instruction of the same width.
	def : InstAlias<"shld{w}\t{$r2, $r1\|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
	def : InstAlias<"shld{l}\t{$r2, $r1\|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
	def : InstAlias<"shld{q}\t{$r2, $r1\|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
	def : InstAlias<"shrd{w}\t{$r2, $r1\|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
	def : InstAlias<"shrd{l}\t{$r2, $r1\|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
	def : InstAlias<"shrd{q}\t{$r2, $r1\|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;

	// Memory-destination forms: same idea, matched to the *mrCL instructions.
	def : InstAlias<"shld{w}\t{$reg, $mem\|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
	def : InstAlias<"shld{l}\t{$reg, $mem\|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
	def : InstAlias<"shld{q}\t{$reg, $mem\|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
	def : InstAlias<"shrd{w}\t{$reg, $mem\|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
	def : InstAlias<"shrd{l}\t{$reg, $mem\|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
	def : InstAlias<"shrd{q}\t{$reg, $mem\|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;

	/* FIXME: This is disabled because the asm matcher is currently incapable of
	* matching a fixed immediate like $1.
	// "shl X, $1" is an alias for "shl X".
	multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
	def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
	(!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
	def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
	(!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
	def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
	(!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
	def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
	(!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
	def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
	(!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
	def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
	(!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
	def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
	(!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
	def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
	(!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
	}

	defm : ShiftRotateByOneAlias<"rcl", "RCL">;
	defm : ShiftRotateByOneAlias<"rcr", "RCR">;
	defm : ShiftRotateByOneAlias<"rol", "ROL">;
	defm : ShiftRotateByOneAlias<"ror", "ROR">;
	FIXME */

	// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
	// These aliases supply the operand order that the TEST*mr instructions do
	// not spell themselves; one alias per operand width.
	def : InstAlias<"test{b}\t{$mem, $val\|$val, $mem}",
	(TEST8mr i8mem :$mem, GR8 :$val), 0>;
	def : InstAlias<"test{w}\t{$mem, $val\|$val, $mem}",
	(TEST16mr i16mem:$mem, GR16:$val), 0>;
	def : InstAlias<"test{l}\t{$mem, $val\|$val, $mem}",
	(TEST32mr i32mem:$mem, GR32:$val), 0>;
	def : InstAlias<"test{q}\t{$mem, $val\|$val, $mem}",
	(TEST64mr i64mem:$mem, GR64:$val), 0>;

	// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
	// Same scheme as for test above, matched to the XCHG*rm instructions.
	def : InstAlias<"xchg{b}\t{$mem, $val\|$val, $mem}",
	(XCHG8rm GR8 :$val, i8mem :$mem), 0>;
	def : InstAlias<"xchg{w}\t{$mem, $val\|$val, $mem}",
	(XCHG16rm GR16:$val, i16mem:$mem), 0>;
	def : InstAlias<"xchg{l}\t{$mem, $val\|$val, $mem}",
	(XCHG32rm GR32:$val, i32mem:$mem), 0>;
	def : InstAlias<"xchg{q}\t{$mem, $val\|$val, $mem}",
	(XCHG64rm GR64:$val, i64mem:$mem), 0>;

	// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
	// The 32-bit alias is split by mode: XCHG32ar with GR32 outside 64-bit mode,
	// XCHG32ar64 with GR32_NOAX in 64-bit mode.
	// NOTE(review): GR32_NOAX presumably excludes EAX from $src; confirm against
	// the register class definition.
	def : InstAlias<"xchg{w}\t{%ax, $src\|$src, ax}", (XCHG16ar GR16:$src), 0>;
	def : InstAlias<"xchg{l}\t{%eax, $src\|$src, eax}",
	(XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
	def : InstAlias<"xchg{l}\t{%eax, $src\|$src, eax}",
	(XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
	def : InstAlias<"xchg{q}\t{%rax, $src\|$src, rax}", (XCHG64ar GR64:$src), 0>;

	// These aliases exist to get the parser to prioritize matching 8-bit
	// immediate encodings over matching the implicit ax/eax/rax encodings. By
	// explicitly mentioning the A register here, these entries will be ordered
	// first due to the more explicit immediate type.
	// The same eight operations (adc, add, and, cmp, or, sbb, sub, xor) are
	// listed once per accumulator width.

	// 16-bit: AX with an i16i8imm immediate.
	def : InstAlias<"adc{w}\t{$imm, %ax\|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>;
	def : InstAlias<"add{w}\t{$imm, %ax\|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>;
	def : InstAlias<"and{w}\t{$imm, %ax\|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>;
	def : InstAlias<"cmp{w}\t{$imm, %ax\|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>;
	def : InstAlias<"or{w}\t{$imm, %ax\|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>;
	def : InstAlias<"sbb{w}\t{$imm, %ax\|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>;
	def : InstAlias<"sub{w}\t{$imm, %ax\|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>;
	def : InstAlias<"xor{w}\t{$imm, %ax\|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>;

	// 32-bit: EAX with an i32i8imm immediate.
	def : InstAlias<"adc{l}\t{$imm, %eax\|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>;
	def : InstAlias<"add{l}\t{$imm, %eax\|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>;
	def : InstAlias<"and{l}\t{$imm, %eax\|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>;
	def : InstAlias<"cmp{l}\t{$imm, %eax\|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>;
	def : InstAlias<"or{l}\t{$imm, %eax\|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>;
	def : InstAlias<"sbb{l}\t{$imm, %eax\|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>;
	def : InstAlias<"sub{l}\t{$imm, %eax\|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>;
	def : InstAlias<"xor{l}\t{$imm, %eax\|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>;

	// 64-bit: RAX with an i64i8imm immediate.
	def : InstAlias<"adc{q}\t{$imm, %rax\|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>;
	def : InstAlias<"add{q}\t{$imm, %rax\|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>;
	def : InstAlias<"and{q}\t{$imm, %rax\|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>;
	def : InstAlias<"cmp{q}\t{$imm, %rax\|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>;
	def : InstAlias<"or{q}\t{$imm, %rax\|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>;
	def : InstAlias<"sbb{q}\t{$imm, %rax\|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>;
	def : InstAlias<"sub{q}\t{$imm, %rax\|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>;
	def : InstAlias<"xor{q}\t{$imm, %rax\|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>;
	Index: head/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp (revision 328817)
	@@ -1,2024 +1,2032 @@
	//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains code to lower X86 MachineInstrs to their corresponding
	// MCInst records.
	//
	//===----------------------------------------------------------------------===//

	#include "InstPrinter/X86ATTInstPrinter.h"
	#include "InstPrinter/X86InstComments.h"
	#include "MCTargetDesc/X86BaseInfo.h"
	#include "MCTargetDesc/X86TargetStreamer.h"
	#include "Utils/X86ShuffleDecode.h"
	#include "X86AsmPrinter.h"
	#include "X86RegisterInfo.h"
	#include "X86ShuffleDecodeConstantPool.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineModuleInfoImpls.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/StackMaps.h"
	#include "llvm/CodeGen/TargetLoweringObjectFile.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Mangler.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCCodeEmitter.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCFixup.h"
	#include "llvm/MC/MCInst.h"
	#include "llvm/MC/MCInstBuilder.h"
	#include "llvm/MC/MCSection.h"
	#include "llvm/MC/MCSectionELF.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/MC/MCSymbolELF.h"

	using namespace llvm;

	namespace {

	/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
	class X86MCInstLower {
	// State captured by the constructor. Declaration order matters: the
	// constructor initializes MAI from TM, so TM has to be declared before it.
	MCContext &Ctx;
	const MachineFunction &MF;
	const TargetMachine &TM;
	const MCAsmInfo &MAI;
	X86AsmPrinter &AsmPrinter;
	public:
	X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);

	/// Lower a single operand of \p MI. Returns None for operands that produce
	/// no MCOperand (implicit registers and register masks).
	Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
	const MachineOperand &MO) const;
	/// Lower \p MI into \p OutMI.
	void Lower(const MachineInstr *MI, MCInst &OutMI) const;

	/// Return the MCSymbol for a global, external-symbol or basic-block operand.
	MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
	/// Wrap \p Sym in an expression operand according to \p MO's target flags
	/// and offset.
	MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;

	private:
	/// Shortcut for the MachO-specific module info of MF.
	MachineModuleInfoMachO &getMachOMMI() const;
	};

	} // end anonymous namespace

	// Emit a minimal sequence of nops spanning NumBytes bytes.
	// Declared ahead of its definition so that
	// StackMapShadowTracker::emitShadowPadding below can call it.
	static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
	const MCSubtargetInfo &STI);

	/// Account for one instruction while a shadow is being tracked.
	///
	/// Encodes \p Inst with \p CodeEmitter into a scratch buffer purely to learn
	/// its encoded size, adds that size to CurrentShadowSize, and stops tracking
	/// once RequiredShadowSize has been reached. Does nothing when InShadow is
	/// already false.
	void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
	                                                 const MCSubtargetInfo &STI,
	                                                 MCCodeEmitter *CodeEmitter) {
	  if (!InShadow)
	    return;

	  // Encode into a throw-away buffer; only the byte count is of interest.
	  SmallString<256> EncodedBytes;
	  SmallVector<MCFixup, 4> UnusedFixups;
	  raw_svector_ostream EncodedStream(EncodedBytes);
	  CodeEmitter->encodeInstruction(Inst, EncodedStream, UnusedFixups, STI);

	  CurrentShadowSize += EncodedBytes.size();

	  // Keep counting only while the shadow is still too small.
	  InShadow = CurrentShadowSize < RequiredShadowSize;
	}

	/// Close the current shadow, padding it with nops if it is still too small.
	///
	/// When a shadow is open and fewer than RequiredShadowSize bytes have been
	/// counted, clears InShadow and emits the missing number of bytes as nops to
	/// \p OutStreamer. Otherwise this is a no-op.
	void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
	    MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
	  const bool NeedsPadding =
	      InShadow && CurrentShadowSize < RequiredShadowSize;
	  if (!NeedsPadding)
	    return;

	  InShadow = false;
	  const auto MissingBytes = RequiredShadowSize - CurrentShadowSize;
	  const bool Is64Bit = MF->getSubtarget<X86Subtarget>().is64Bit();
	  EmitNops(OutStreamer, MissingBytes, Is64Bit, STI);
	}

	/// Emit \p Inst to the output streamer and feed it to the shadow tracker.
	///
	/// Scheduling info is requested from the streamer only when
	/// EnablePrintSchedInfo is set and the instruction does not carry the
	/// X86::NO_SCHED_INFO flag.
	void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
	  const auto &STI = getSubtargetInfo();
	  const bool PrintSchedInfo =
	      EnablePrintSchedInfo && !(Inst.getFlags() & X86::NO_SCHED_INFO);

	  OutStreamer->EmitInstruction(Inst, STI, PrintSchedInfo);
	  SMShadowTracker.count(Inst, STI, CodeEmitter.get());
	}

	/// Capture the context, target machine and asm info of \p mf together with
	/// the printer that drives the lowering.
	/// MAI is derived from TM, which in turn comes from \p mf, so the
	/// initializers rely on TM being set up before MAI.
	X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
	X86AsmPrinter &asmprinter)
	: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()),
	AsmPrinter(asmprinter) {}

	/// Return the MachO object-file info attached to this function's
	/// MachineModuleInfo.
	MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
	  auto &ModuleInfo = MF.getMMI();
	  return ModuleInfo.getObjFileInfo<MachineModuleInfoMachO>();
	}


	/// GetSymbolFromOperand - Lower an MO_GlobalAddress, MO_ExternalSymbol or
	/// MO_MachineBasicBlock operand to an MCSymbol.
	///
	/// The operand's target flags can decorate the name: MO_DLLIMPORT prepends
	/// "__imp_", and the Darwin non-lazy flags append "$non_lazy_ptr" (with the
	/// private-global prefix in front) and register a stub entry for the symbol.
	MCSymbol *X86MCInstLower::
	GetSymbolFromOperand(const MachineOperand &MO) const {
	const DataLayout &DL = MF.getDataLayout();
	assert((MO.isGlobal() \|\| MO.isSymbol() \|\| MO.isMBB()) && "Isn't a symbol reference");

	MCSymbol *Sym = nullptr;
	SmallString<128> Name;
	StringRef Suffix;

	// First pass over the target flags: pick the name prefix or suffix.
	switch (MO.getTargetFlags()) {
	case X86II::MO_DLLIMPORT:
	// Handle dllimport linkage.
	Name += "__imp_";
	break;
	case X86II::MO_DARWIN_NONLAZY:
	case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
	Suffix = "$non_lazy_ptr";
	break;
	}

	// Suffixed (non-lazy pointer) names also get the private global prefix.
	if (!Suffix.empty())
	Name += DL.getPrivateGlobalPrefix();

	if (MO.isGlobal()) {
	const GlobalValue *GV = MO.getGlobal();
	AsmPrinter.getNameWithPrefix(Name, GV);
	} else if (MO.isSymbol()) {
	Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
	} else if (MO.isMBB()) {
	// Basic blocks already own a symbol; Name is not used for them.
	assert(Suffix.empty());
	Sym = MO.getMBB()->getSymbol();
	}

	Name += Suffix;
	if (!Sym)
	Sym = Ctx.getOrCreateSymbol(Name);

	// If the target flags on the operand changes the name of the symbol, do that
	// before we return the symbol.
	switch (MO.getTargetFlags()) {
	default: break;
	case X86II::MO_DARWIN_NONLAZY:
	case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
	// Fill in the stub entry for Sym the first time it is seen; the entry
	// records the underlying global's symbol and whether the global has
	// non-internal linkage.
	MachineModuleInfoImpl::StubValueTy &StubSym =
	getMachOMMI().getGVStubEntry(Sym);
	if (!StubSym.getPointer()) {
	assert(MO.isGlobal() && "Extern symbol not handled yet");
	StubSym =
	MachineModuleInfoImpl::
	StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
	!MO.getGlobal()->hasInternalLinkage());
	}
	break;
	}
	}

	return Sym;
	}

	/// Build the expression operand that refers to \p Sym for operand \p MO.
	///
	/// The operand's target flags select either a symbol-reference variant kind
	/// or a "symbol minus PIC base" expression. Any non-zero operand offset is
	/// then added, except for jump-table and basic-block operands.
	///
	/// \param MO  operand supplying the target flags and the offset.
	/// \param Sym symbol the resulting expression refers to.
	/// \returns an MCOperand wrapping the resulting MCExpr.
	MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
	MCSymbol *Sym) const {
	// FIXME: We would like an efficient form for this, so we don't have to do a
	// lot of extra uniquing.
	// Expr stays null unless a case below builds a complete expression itself;
	// otherwise a plain reference of kind RefKind is created after the switch.
	const MCExpr *Expr = nullptr;
	MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;

	switch (MO.getTargetFlags()) {
	default: llvm_unreachable("Unknown target flag on GV operand");
	case X86II::MO_NO_FLAG: // No flag.
	// These affect the name of the symbol, not any suffix.
	case X86II::MO_DARWIN_NONLAZY:
	case X86II::MO_DLLIMPORT:
	break;

	case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
	case X86II::MO_TLVP_PIC_BASE:
	Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
	// Subtract the pic base.
	Expr = MCBinaryExpr::createSub(Expr,
	MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
	Ctx),
	Ctx);
	break;
	case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
	case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
	case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
	case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
	case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
	case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
	case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
	case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
	case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
	case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
	case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
	case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
	case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
	case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
	case X86II::MO_ABS8: RefKind = MCSymbolRefExpr::VK_X86_ABS8; break;
	case X86II::MO_PIC_BASE_OFFSET:
	case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
	Expr = MCSymbolRefExpr::create(Sym, Ctx);
	// Subtract the pic base.
	Expr = MCBinaryExpr::createSub(Expr,
	MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
	Ctx);
	if (MO.isJTI()) {
	assert(MAI.doesSetDirectiveSuppressReloc());
	// If .set directive is supported, use it to reduce the number of
	// relocations the assembler will generate for differences between
	// local labels. This is only safe when the symbols are in the same
	// section so we are restricting it to jumptable references.
	// Note that this emits an assignment through the printer's streamer as a
	// side effect of lowering the operand.
	MCSymbol *Label = Ctx.createTempSymbol();
	AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
	Expr = MCSymbolRefExpr::create(Label, Ctx);
	}
	break;
	}

	// No case built an expression: fall back to a plain reference of RefKind.
	if (!Expr)
	Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);

	// Fold in the operand's constant offset; jump-table and basic-block
	// operands are skipped.
	if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
	Expr = MCBinaryExpr::createAdd(Expr,
	MCConstantExpr::create(MO.getOffset(), Ctx),
	Ctx);
	return MCOperand::createExpr(Expr);
	}


	/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
	/// a short fixed-register form.
	///
	/// \param Inst   instruction to rewrite in place. Expected to have either two
	///               operands (reg, imm/expr) or three (reg, same reg, imm/expr).
	/// \param Opcode opcode of the short form to switch to.
	///
	/// Nothing changes unless operand 0 is AL, AX, EAX or RAX. On a rewrite the
	/// instruction ends up with \p Opcode and the immediate/expression as its
	/// only operand.
	static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
	// The immediate (or expression) is always the last operand.
	unsigned ImmOp = Inst.getNumOperands() - 1;
	assert(Inst.getOperand(0).isReg() &&
	(Inst.getOperand(ImmOp).isImm() \|\| Inst.getOperand(ImmOp).isExpr()) &&
	((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
	Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) \|\|
	Inst.getNumOperands() == 2) && "Unexpected instruction!");

	// Check whether the destination register can be fixed.
	unsigned Reg = Inst.getOperand(0).getReg();
	if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
	return;

	// If so, rewrite the instruction.
	// The operand is copied out first because Inst is reset before it is
	// re-added.
	MCOperand Saved = Inst.getOperand(ImmOp);
	Inst = MCInst();
	Inst.setOpcode(Opcode);
	Inst.addOperand(Saved);
	}

	/// \brief Replace a register-to-register MOVSX on the accumulator with the
	/// dedicated operand-less instruction when one exists.
	///
	/// Handles MOVSX16rr8 (AL -> AX, becomes CBW), MOVSX32rr16 (AX -> EAX, becomes
	/// CWDE) and MOVSX64rr32 (EAX -> RAX, becomes CDQE). Any other register pair
	/// leaves \p Inst untouched; any other opcode is a programming error.
	static void SimplifyMOVSX(MCInst &Inst) {
	  const unsigned DstReg = Inst.getOperand(0).getReg();
	  const unsigned SrcReg = Inst.getOperand(1).getReg();

	  // For the given width, which register pair qualifies and which opcode
	  // replaces the instruction.
	  unsigned WantedDst = 0;
	  unsigned WantedSrc = 0;
	  unsigned ShortOpcode = 0;
	  switch (Inst.getOpcode()) {
	  default:
	    llvm_unreachable("Unexpected instruction!");
	  case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
	    WantedDst = X86::AX;
	    WantedSrc = X86::AL;
	    ShortOpcode = X86::CBW;
	    break;
	  case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
	    WantedDst = X86::EAX;
	    WantedSrc = X86::AX;
	    ShortOpcode = X86::CWDE;
	    break;
	  case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
	    WantedDst = X86::RAX;
	    WantedSrc = X86::EAX;
	    ShortOpcode = X86::CDQE;
	    break;
	  }

	  if (DstReg != WantedDst)
	    return;
	  if (SrcReg != WantedSrc)
	    return;

	  // The short form takes no explicit operands.
	  MCInst Replacement;
	  Replacement.setOpcode(ShortOpcode);
	  Inst = Replacement;
	}

	/// \brief Simplify things like MOV32rm to MOV32o32a.
	///
	/// \param Printer used only to query whether the subtarget is 64-bit.
	/// \param Inst    six-operand move between a register and a memory reference;
	///                rewritten in place when the simplification applies.
	/// \param Opcode  opcode of the short form to switch to.
	///
	/// The rewrite happens only outside 64-bit mode, only when the register
	/// operand is AL, AX, EAX or RAX, and (unless the address is a TLVP symbol
	/// reference) only when the address has no base register, no index register
	/// and a scale of 1. The rewritten instruction has two operands: the address
	/// operand followed by the segment register.
	static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
	unsigned Opcode) {
	// Don't make these simplifications in 64-bit mode; other assemblers don't
	// perform them because they make the code larger.
	if (Printer.getSubtarget().is64Bit())
	return;

	// When operands 0 and 1 are both registers, the register operand is at
	// index 0 and the address operands start at index 1; otherwise the address
	// starts at index 0 and the register operand is at index 5.
	// NOTE(review): "register first, then address" looks like the layout of the
	// register-destination (load) form, so the name IsStore appears inverted.
	// The indices derived from it are consistent with each other; confirm the
	// naming.
	bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
	unsigned AddrBase = IsStore;
	unsigned RegOp = IsStore ? 0 : 5;
	// Fourth operand of the address group; presumably the displacement
	// (X86::AddrDisp). TODO confirm.
	unsigned AddrOp = AddrBase + 3;
	assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
	Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
	Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
	Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
	Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
	(Inst.getOperand(AddrOp).isExpr() \|\|
	Inst.getOperand(AddrOp).isImm()) &&
	"Unexpected instruction!");

	// Check whether the destination register can be fixed.
	unsigned Reg = Inst.getOperand(RegOp).getReg();
	if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
	return;

	// Check whether this is an absolute address.
	// FIXME: We know TLVP symbol refs aren't, but there should be a better way
	// to do this here.
	bool Absolute = true;
	if (Inst.getOperand(AddrOp).isExpr()) {
	const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
	if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
	if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
	Absolute = false;
	}

	// An absolute address must not use a base register, an index register or a
	// scale other than 1; otherwise the short form cannot express it.
	if (Absolute &&
	(Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 \|\|
	Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 \|\|
	Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
	return;

	// If so, rewrite the instruction.
	// Both operands are copied out first because Inst is reset before they are
	// re-added.
	MCOperand Saved = Inst.getOperand(AddrOp);
	MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
	Inst = MCInst();
	Inst.setOpcode(Opcode);
	Inst.addOperand(Saved);
	Inst.addOperand(Seg);
	}

	/// Pick the return opcode for \p Subtarget: RETQ for 64-bit subtargets, RETL
	/// otherwise.
	static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
	  if (Subtarget.is64Bit())
	    return X86::RETQ;
	  return X86::RETL;
	}

	/// Lower one operand of \p MI to an MCOperand.
	///
	/// Registers and immediates are translated directly. Every symbolic operand
	/// kind is first resolved to an MCSymbol and then handed to
	/// LowerSymbolOperand. Implicit register operands and register masks yield
	/// None, meaning "emit no operand". An unknown operand kind prints \p MI to
	/// errs() and is treated as unreachable.
	Optional<MCOperand>
	X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
	                                    const MachineOperand &MO) const {
	  // Symbol resolved by the symbolic cases below; the non-symbolic cases
	  // return straight out of the switch.
	  MCSymbol *Sym = nullptr;

	  switch (MO.getType()) {
	  default:
	    MI->print(errs());
	    llvm_unreachable("unknown operand type");

	  case MachineOperand::MO_Register:
	    // Ignore all implicit register operands.
	    if (MO.isImplicit())
	      return None;
	    return MCOperand::createReg(MO.getReg());

	  case MachineOperand::MO_Immediate:
	    return MCOperand::createImm(MO.getImm());

	  case MachineOperand::MO_RegisterMask:
	    // Ignore call clobbers.
	    return None;

	  case MachineOperand::MO_MachineBasicBlock:
	  case MachineOperand::MO_GlobalAddress:
	  case MachineOperand::MO_ExternalSymbol:
	    Sym = GetSymbolFromOperand(MO);
	    break;

	  case MachineOperand::MO_MCSymbol:
	    Sym = MO.getMCSymbol();
	    break;

	  case MachineOperand::MO_JumpTableIndex:
	    Sym = AsmPrinter.GetJTISymbol(MO.getIndex());
	    break;

	  case MachineOperand::MO_ConstantPoolIndex:
	    Sym = AsmPrinter.GetCPISymbol(MO.getIndex());
	    break;

	  case MachineOperand::MO_BlockAddress:
	    Sym = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
	    break;
	  }

	  return LowerSymbolOperand(MO, Sym);
	}

	void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
	OutMI.setOpcode(MI->getOpcode());

	for (const MachineOperand &MO : MI->operands())
	if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
	OutMI.addOperand(MaybeMCOp.getValue());

	// Handle a few special cases to eliminate operand modifiers.
	ReSimplify:
	switch (OutMI.getOpcode()) {
	case X86::LEA64_32r:
	case X86::LEA64r:
	case X86::LEA16r:
	case X86::LEA32r:
	// LEA should have a segment register, but it must be empty.
	assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
	"Unexpected # of LEA operands");
	assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
	"LEA has segment specified!");
	break;

	// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
	// if one of the registers is extended, but other isn't.
	case X86::VMOVZPQILo2PQIrr:
	case X86::VMOVAPDrr:
	case X86::VMOVAPDYrr:
	case X86::VMOVAPSrr:
	case X86::VMOVAPSYrr:
	case X86::VMOVDQArr:
	case X86::VMOVDQAYrr:
	case X86::VMOVDQUrr:
	case X86::VMOVDQUYrr:
	case X86::VMOVUPDrr:
	case X86::VMOVUPDYrr:
	case X86::VMOVUPSrr:
	case X86::VMOVUPSYrr: {
	if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
	X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
	unsigned NewOpc;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
	case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
	case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
	case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
	case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
	case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
	case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
	case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
	case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
	case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
	case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
	case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
	case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
	}
	OutMI.setOpcode(NewOpc);
	}
	break;
	}
	case X86::VMOVSDrr:
	case X86::VMOVSSrr: {
	if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
	X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
	unsigned NewOpc;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
	case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
	}
	OutMI.setOpcode(NewOpc);
	}
	break;
	}

	// TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
	// inputs modeled as normal uses instead of implicit uses. As such, truncate
	// off all but the first operand (the callee). FIXME: Change isel.
	case X86::TAILJMPr64:
	case X86::TAILJMPr64_REX:
	case X86::CALL64r:
	case X86::CALL64pcrel32: {
	unsigned Opcode = OutMI.getOpcode();
	MCOperand Saved = OutMI.getOperand(0);
	OutMI = MCInst();
	OutMI.setOpcode(Opcode);
	OutMI.addOperand(Saved);
	break;
	}

	case X86::EH_RETURN:
	case X86::EH_RETURN64: {
	OutMI = MCInst();
	OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
	break;
	}

	case X86::CLEANUPRET: {
	// Replace CLEANUPRET with the appropriate RET.
	OutMI = MCInst();
	OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
	break;
	}

	case X86::CATCHRET: {
	// Replace CATCHRET with the appropriate RET.
	const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
	unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	OutMI = MCInst();
	OutMI.setOpcode(getRetOpcode(Subtarget));
	OutMI.addOperand(MCOperand::createReg(ReturnReg));
	break;
	}

	// TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump instruction.
	{ unsigned Opcode;
	case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
	case X86::TAILJMPd:
	case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
	case X86::TAILJMPd_CC:
	case X86::TAILJMPd64_CC:
	Opcode = X86::GetCondBranchFromCond(
	static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
	goto SetTailJmpOpcode;

	SetTailJmpOpcode:
	MCOperand Saved = OutMI.getOperand(0);
	OutMI = MCInst();
	OutMI.setOpcode(Opcode);
	OutMI.addOperand(Saved);
	break;
	}

	case X86::DEC16r:
	case X86::DEC32r:
	case X86::INC16r:
	case X86::INC32r:
	// If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
	if (!AsmPrinter.getSubtarget().is64Bit()) {
	unsigned Opcode;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
	case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
	case X86::INC16r: Opcode = X86::INC16r_alt; break;
	case X86::INC32r: Opcode = X86::INC32r_alt; break;
	}
	OutMI.setOpcode(Opcode);
	}
	break;

	// These are pseudo-ops for OR to help with the OR->ADD transformation. We do
	// this with an ugly goto in case the resultant OR uses EAX and needs the
	// short form.
	case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
	case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
	case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
	case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
	case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
	case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
	case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
	case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
	case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;

	// Atomic load and store require a separate pseudo-inst because Acquire
	// implies mayStore and Release implies mayLoad; fix these to regular MOV
	// instructions here
	case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
	case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
	case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
	case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
	case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
	case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
	case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
	case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
	case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
	case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
	case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
	case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
	case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
	case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
	case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
	case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
	case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
	case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
	case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
	case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
	case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
	case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
	case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
	case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
	case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
	case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
	case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
	case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
	case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
	case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
	case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
	case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
	case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
	case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
	case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
	case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
	case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
	case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
	case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
	case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
	case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
	case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
	case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
	case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;

	// We don't currently select the correct instruction form for instructions
	// which have a short %eax, etc. form. Handle this by custom lowering, for
	// now.
	//
	// Note, we are currently not handling the following instructions:
	// MOV64ao8, MOV64o8a
	// XCHG16ar, XCHG32ar, XCHG64ar
	case X86::MOV8mr_NOREX:
	case X86::MOV8mr:
	case X86::MOV8rm_NOREX:
	case X86::MOV8rm:
	case X86::MOV16mr:
	case X86::MOV16rm:
	case X86::MOV32mr:
	case X86::MOV32rm: {
	unsigned NewOpc;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::MOV8mr_NOREX:
	case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
	case X86::MOV8rm_NOREX:
	case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
	case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
	case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
	case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
	case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
	}
	SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
	break;
	}

	case X86::ADC8ri: case X86::ADC16ri: case X86::ADC32ri: case X86::ADC64ri32:
	case X86::ADD8ri: case X86::ADD16ri: case X86::ADD32ri: case X86::ADD64ri32:
	case X86::AND8ri: case X86::AND16ri: case X86::AND32ri: case X86::AND64ri32:
	case X86::CMP8ri: case X86::CMP16ri: case X86::CMP32ri: case X86::CMP64ri32:
	case X86::OR8ri: case X86::OR16ri: case X86::OR32ri: case X86::OR64ri32:
	case X86::SBB8ri: case X86::SBB16ri: case X86::SBB32ri: case X86::SBB64ri32:
	case X86::SUB8ri: case X86::SUB16ri: case X86::SUB32ri: case X86::SUB64ri32:
	case X86::TEST8ri:case X86::TEST16ri:case X86::TEST32ri:case X86::TEST64ri32:
	case X86::XOR8ri: case X86::XOR16ri: case X86::XOR32ri: case X86::XOR64ri32: {
	unsigned NewOpc;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::ADC8ri: NewOpc = X86::ADC8i8; break;
	case X86::ADC16ri: NewOpc = X86::ADC16i16; break;
	case X86::ADC32ri: NewOpc = X86::ADC32i32; break;
	case X86::ADC64ri32: NewOpc = X86::ADC64i32; break;
	case X86::ADD8ri: NewOpc = X86::ADD8i8; break;
	case X86::ADD16ri: NewOpc = X86::ADD16i16; break;
	case X86::ADD32ri: NewOpc = X86::ADD32i32; break;
	case X86::ADD64ri32: NewOpc = X86::ADD64i32; break;
	case X86::AND8ri: NewOpc = X86::AND8i8; break;
	case X86::AND16ri: NewOpc = X86::AND16i16; break;
	case X86::AND32ri: NewOpc = X86::AND32i32; break;
	case X86::AND64ri32: NewOpc = X86::AND64i32; break;
	case X86::CMP8ri: NewOpc = X86::CMP8i8; break;
	case X86::CMP16ri: NewOpc = X86::CMP16i16; break;
	case X86::CMP32ri: NewOpc = X86::CMP32i32; break;
	case X86::CMP64ri32: NewOpc = X86::CMP64i32; break;
	case X86::OR8ri: NewOpc = X86::OR8i8; break;
	case X86::OR16ri: NewOpc = X86::OR16i16; break;
	case X86::OR32ri: NewOpc = X86::OR32i32; break;
	case X86::OR64ri32: NewOpc = X86::OR64i32; break;
	case X86::SBB8ri: NewOpc = X86::SBB8i8; break;
	case X86::SBB16ri: NewOpc = X86::SBB16i16; break;
	case X86::SBB32ri: NewOpc = X86::SBB32i32; break;
	case X86::SBB64ri32: NewOpc = X86::SBB64i32; break;
	case X86::SUB8ri: NewOpc = X86::SUB8i8; break;
	case X86::SUB16ri: NewOpc = X86::SUB16i16; break;
	case X86::SUB32ri: NewOpc = X86::SUB32i32; break;
	case X86::SUB64ri32: NewOpc = X86::SUB64i32; break;
	case X86::TEST8ri: NewOpc = X86::TEST8i8; break;
	case X86::TEST16ri: NewOpc = X86::TEST16i16; break;
	case X86::TEST32ri: NewOpc = X86::TEST32i32; break;
	case X86::TEST64ri32: NewOpc = X86::TEST64i32; break;
	case X86::XOR8ri: NewOpc = X86::XOR8i8; break;
	case X86::XOR16ri: NewOpc = X86::XOR16i16; break;
	case X86::XOR32ri: NewOpc = X86::XOR32i32; break;
	case X86::XOR64ri32: NewOpc = X86::XOR64i32; break;
	}
	SimplifyShortImmForm(OutMI, NewOpc);
	break;
	}

	// Try to shrink some forms of movsx.
	case X86::MOVSX16rr8:
	case X86::MOVSX32rr16:
	case X86::MOVSX64rr32:
	SimplifyMOVSX(OutMI);
	break;
	}
	}

	/// Expand a TLS_addr32 / TLS_addr64 / TLS_base_addr32 / TLS_base_addr64 pseudo
	/// into an LEA of the TLS symbol followed by a call to the TLS address helper.
	///
	/// \param MCInstLowering resolves operand 3 of \p MI to its MCSymbol.
	/// \param MI the TLS pseudo instruction being lowered.
	void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
	                                 const MachineInstr &MI) {
	  // Classify the pseudo in one place: which relocation variant it needs,
	  // whether it is a 64-bit form, and whether the sequence is wrapped in prefix
	  // bytes (only TLS_addr64 is).
	  bool Is64 = false;
	  bool Padded = false;
	  MCSymbolRefExpr::VariantKind Variant;
	  switch (MI.getOpcode()) {
	  case X86::TLS_addr32:
	    Variant = MCSymbolRefExpr::VK_TLSGD;
	    break;
	  case X86::TLS_addr64:
	    Variant = MCSymbolRefExpr::VK_TLSGD;
	    Is64 = true;
	    Padded = true;
	    break;
	  case X86::TLS_base_addr32:
	    Variant = MCSymbolRefExpr::VK_TLSLDM;
	    break;
	  case X86::TLS_base_addr64:
	    Variant = MCSymbolRefExpr::VK_TLSLD;
	    Is64 = true;
	    break;
	  default:
	    llvm_unreachable("unexpected opcode");
	  }

	  MCContext &Ctx = OutStreamer->getContext();

	  // Leading prefix byte of the padded sequence.
	  if (Padded)
	    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));

	  MCSymbol *TlsSym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
	  const MCSymbolRefExpr *TlsSymRef =
	      MCSymbolRefExpr::create(TlsSym, Variant, Ctx);

	  // Pick the LEA flavour and its address registers. The 32-bit forms address
	  // through EBX: as the base register for VK_TLSLDM, as the index register
	  // otherwise. The 64-bit forms are RIP-relative.
	  unsigned LeaOpcode = X86::LEA32r;
	  unsigned DestReg = X86::EAX;
	  unsigned BaseReg = 0;
	  unsigned IndexReg = 0;
	  if (Is64) {
	    LeaOpcode = X86::LEA64r;
	    DestReg = X86::RDI;
	    BaseReg = X86::RIP;
	  } else if (Variant == MCSymbolRefExpr::VK_TLSLDM) {
	    BaseReg = X86::EBX;
	  } else {
	    IndexReg = X86::EBX;
	  }

	  MCInst LEA;
	  LEA.setOpcode(LeaOpcode);
	  LEA.addOperand(MCOperand::createReg(DestReg));   // dest
	  LEA.addOperand(MCOperand::createReg(BaseReg));   // base
	  LEA.addOperand(MCOperand::createImm(1));         // scale
	  LEA.addOperand(MCOperand::createReg(IndexReg));  // index
	  LEA.addOperand(MCOperand::createExpr(TlsSymRef)); // disp
	  LEA.addOperand(MCOperand::createReg(0));         // seg
	  EmitAndCountInstruction(LEA);

	  // Trailing prefix bytes of the padded sequence, placed right before the call.
	  if (Padded) {
	    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
	    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
	    EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
	  }

	  // The helper is spelled with two leading underscores on 64-bit and three on
	  // 32-bit; it is always referenced through the PLT.
	  StringRef HelperName = Is64 ? "__tls_get_addr" : "___tls_get_addr";
	  MCSymbol *HelperSym = Ctx.getOrCreateSymbol(HelperName);
	  const MCSymbolRefExpr *HelperRef =
	      MCSymbolRefExpr::create(HelperSym, MCSymbolRefExpr::VK_PLT, Ctx);

	  const unsigned CallOpcode = Is64 ? X86::CALL64pcrel32 : X86::CALLpcrel32;
	  EmitAndCountInstruction(MCInstBuilder(CallOpcode).addExpr(HelperRef));
	}

	/// \brief Emit a single nop that is at most \p NumBytes bytes long.
	///
	/// The base nop covers 1 to 10 bytes; up to five 0x66 prefix bytes are emitted
	/// in front of it when more room was requested, so one call covers at most 15
	/// bytes.
	///
	/// \param OS streamer receiving the bytes / instruction.
	/// \param NumBytes space to fill; must be non-zero.
	/// \param Is64Bit must be true, only X86-64 is handled here.
	/// \param STI subtarget info forwarded to the streamer.
	/// \returns the number of bytes accounted for by this call.
	static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
	                        const MCSubtargetInfo &STI) {
	  // This works only for 64bit. For 32bit we have to do additional checking if
	  // the CPU supports multi-byte nops.
	  assert(Is64Bit && "EmitNops only supports X86-64");

	  unsigned NopSize;
	  unsigned Opc = 0;
	  unsigned IndexReg = 0;
	  unsigned Displacement = 0;
	  unsigned SegmentReg = 0;
	  const unsigned BaseReg = X86::RAX;
	  const unsigned ScaleVal = 1;

	  // Choose the opcode and the shape of its memory operand for the base nop.
	  switch (NumBytes) {
	  case 0:
	    llvm_unreachable("Zero nops?");
	    break;
	  case 1:
	    NopSize = 1;
	    Opc = X86::NOOP;
	    break;
	  case 2:
	    NopSize = 2;
	    Opc = X86::XCHG16ar;
	    break;
	  case 3:
	    NopSize = 3;
	    Opc = X86::NOOPL;
	    break;
	  case 4:
	    NopSize = 4;
	    Opc = X86::NOOPL;
	    Displacement = 8;
	    break;
	  case 5:
	    NopSize = 5;
	    Opc = X86::NOOPL;
	    Displacement = 8;
	    IndexReg = X86::RAX;
	    break;
	  case 6:
	    NopSize = 6;
	    Opc = X86::NOOPW;
	    Displacement = 8;
	    IndexReg = X86::RAX;
	    break;
	  case 7:
	    NopSize = 7;
	    Opc = X86::NOOPL;
	    Displacement = 512;
	    break;
	  case 8:
	    NopSize = 8;
	    Opc = X86::NOOPL;
	    Displacement = 512;
	    IndexReg = X86::RAX;
	    break;
	  case 9:
	    NopSize = 9;
	    Opc = X86::NOOPW;
	    Displacement = 512;
	    IndexReg = X86::RAX;
	    break;
	  default:
	    NopSize = 10;
	    Opc = X86::NOOPW;
	    Displacement = 512;
	    IndexReg = X86::RAX;
	    SegmentReg = X86::CS;
	    break;
	  }

	  // Stretch the nop with at most five 0x66 prefix bytes if space remains.
	  const unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
	  NopSize += NumPrefixes;
	  for (unsigned Emitted = 0; Emitted != NumPrefixes; ++Emitted)
	    OS.EmitBytes("\x66");

	  if (Opc == X86::NOOP) {
	    OS.EmitInstruction(MCInstBuilder(Opc), STI);
	  } else if (Opc == X86::XCHG16ar) {
	    OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
	  } else if (Opc == X86::NOOPL || Opc == X86::NOOPW) {
	    // Both long-nop forms take a full five-part memory operand.
	    OS.EmitInstruction(MCInstBuilder(Opc)
	                           .addReg(BaseReg)
	                           .addImm(ScaleVal)
	                           .addReg(IndexReg)
	                           .addImm(Displacement)
	                           .addReg(SegmentReg),
	                       STI);
	  } else {
	    llvm_unreachable("Unexpected opcode");
	  }

	  assert(NopSize <= NumBytes && "We overemitted?");
	  return NopSize;
	}

	/// \brief Fill \p NumBytes bytes with nops by calling EmitNop repeatedly until
	/// nothing is left to cover.
	static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
	                     const MCSubtargetInfo &STI) {
	  // Only read by the assert below; the cast keeps release builds warning-free.
	  const unsigned Requested = NumBytes;
	  (void)Requested;

	  for (unsigned Remaining = NumBytes; Remaining != 0;) {
	    Remaining -= EmitNop(OS, Remaining, Is64Bit, STI);
	    assert(Requested >= Remaining && "Emitted more than I asked for!");
	  }
	}

	// Lower a STATEPOINT pseudo. When the statepoint requests patch bytes, only
	// that many bytes of nops are emitted; otherwise a call to the statepoint's
	// call target is emitted. In both cases the statepoint is recorded in SM.
	//
	// MI is the STATEPOINT instruction; MCIL is used to lower a symbolic call
	// target to an MCOperand.
	void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");

	StatepointOpers SOpers(&MI);
	if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
	// A non-zero patch size means no call is emitted here, only nops.
	EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
	getSubtargetInfo());
	} else {
	// Lower call target and choose correct opcode
	const MachineOperand &CallTarget = SOpers.getCallTarget();
	MCOperand CallTargetMCOp;
	unsigned CallOpcode;
	switch (CallTarget.getType()) {
	case MachineOperand::MO_GlobalAddress:
	case MachineOperand::MO_ExternalSymbol:
	CallTargetMCOp = MCIL.LowerSymbolOperand(
	CallTarget, MCIL.GetSymbolFromOperand(CallTarget));
	CallOpcode = X86::CALL64pcrel32;
	// Currently, we only support relative addressing with statepoints.
	// Otherwise, we'll need a scratch register to hold the target
	// address. You'll fail asserts during load & relocation if this
	// symbol is too far away. (TODO: support non-relative addressing)
	break;
	case MachineOperand::MO_Immediate:
	CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
	CallOpcode = X86::CALL64pcrel32;
	// Currently, we only support relative addressing with statepoints.
	// Otherwise, we'll need a scratch register to hold the target
	// immediate. You'll fail asserts during load & relocation if this
	// address is too far away. (TODO: support non-relative addressing)
	break;
	case MachineOperand::MO_Register:
	+ // FIXME: Add retpoline support and remove this.
	+ if (Subtarget->useRetpoline())
	+ report_fatal_error("Lowering register statepoints with retpoline not "
	+ "yet implemented.");
	// Register targets use the indirect call form.
	CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
	CallOpcode = X86::CALL64r;
	break;
	default:
	llvm_unreachable("Unsupported operand type in statepoint call target");
	break;
	}

	// Emit call
	// Note: this goes straight to the streamer rather than through
	// EmitAndCountInstruction.
	MCInst CallInst;
	CallInst.setOpcode(CallOpcode);
	CallInst.addOperand(CallTargetMCOp);
	OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
	}

	// Record our statepoint node in the same section used by STACKMAP
	// and PATCHPOINT
	SM.recordStatepoint(MI);
	}

	void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
	X86MCInstLower &MCIL) {
	// FAULTING_LOAD_OP <def>, <faltinf type>, <MBB handler>,
	// <opcode>, <operands>

	unsigned DefRegister = FaultingMI.getOperand(0).getReg();
	FaultMaps::FaultKind FK =
	static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
	MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
	unsigned Opcode = FaultingMI.getOperand(3).getImm();
	unsigned OperandsBeginIdx = 4;

	assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
	FM.recordFaultingOp(FK, HandlerLabel);

	MCInst MI;
	MI.setOpcode(Opcode);

	if (DefRegister != X86::NoRegister)
	MI.addOperand(MCOperand::createReg(DefRegister));

	for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
	E = FaultingMI.operands_end();
	I != E; ++I)
	if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
	MI.addOperand(MaybeOperand.getValue());

	OutStreamer->EmitInstruction(MI, getSubtargetInfo());
	}

	void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	bool Is64Bits = Subtarget->is64Bit();
	MCContext &Ctx = OutStreamer->getContext();
	MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
	const MCSymbolRefExpr *Op =
	MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_None, Ctx);

	EmitAndCountInstruction(
	MCInstBuilder(Is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
	.addExpr(Op));
	}

	void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	// PATCHABLE_OP minsize, opcode, operands

	unsigned MinSize = MI.getOperand(0).getImm();
	unsigned Opcode = MI.getOperand(1).getImm();

	MCInst MCI;
	MCI.setOpcode(Opcode);
	for (auto &MO : make_range(MI.operands_begin() + 2, MI.operands_end()))
	if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
	MCI.addOperand(MaybeOperand.getValue());

	SmallString<256> Code;
	SmallVector<MCFixup, 4> Fixups;
	raw_svector_ostream VecOS(Code);
	CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());

	if (Code.size() < MinSize) {
	if (MinSize == 2 && Opcode == X86::PUSH64r) {
	// This is an optimization that lets us get away without emitting a nop in
	// many cases.
	//
	// NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %r9) takes two
	// bytes too, so the check on MinSize is important.
	MCI.setOpcode(X86::PUSH64rmr);
	} else {
	unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
	getSubtargetInfo());
	assert(NopSize == MinSize && "Could not implement MinSize!");
	(void) NopSize;
	}
	}

	OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
	}

	// Lower a stackmap of the form:
	//   <id>, <shadowBytes>, ...
	//
	// Pending shadow padding is emitted first, then the stackmap is recorded and
	// the shadow tracker is re-armed with this stackmap's shadow byte count.
	void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
	  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
	  SM.recordStackMap(MI);

	  // Operand 1 carries the shadow size.
	  SMShadowTracker.reset(static_cast<unsigned>(MI.getOperand(1).getImm()));
	}

	// Lower a patchpoint of the form:
	// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
	//
	// The patchpoint is recorded in SM. Unless the call target is a zero
	// immediate, a MOV of the target into the scratch register followed by an
	// indirect call through that register is emitted. The remainder of the
	// requested <numBytes> is filled with nops.
	void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");

	SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());

	SM.recordPatchPoint(MI);

	PatchPointOpers opers(&MI);
	unsigned ScratchIdx = opers.getNextScratchIdx();
	// Stays 0 when no call is emitted, so the whole patch area becomes nops.
	unsigned EncodedBytes = 0;
	const MachineOperand &CalleeMO = opers.getCallTarget();

	// Check for null target. If target is non-null (i.e. is non-zero or is
	// symbolic) then emit a call.
	if (!(CalleeMO.isImm() && !CalleeMO.getImm())) {
	MCOperand CalleeMCOp;
	switch (CalleeMO.getType()) {
	default:
	/// FIXME: Add a verifier check for bad callee types.
	llvm_unreachable("Unrecognized callee operand type.");
	case MachineOperand::MO_Immediate:
	// Always true here: zero immediates were excluded by the enclosing check.
	if (CalleeMO.getImm())
	CalleeMCOp = MCOperand::createImm(CalleeMO.getImm());
	break;
	case MachineOperand::MO_ExternalSymbol:
	case MachineOperand::MO_GlobalAddress:
	CalleeMCOp =
	MCIL.LowerSymbolOperand(CalleeMO,
	MCIL.GetSymbolFromOperand(CalleeMO));
	break;
	}

	// Emit MOV to materialize the target address and the CALL to target.
	// This is encoded with 12-13 bytes, depending on which register is used.
	unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
	if (X86II::isX86_64ExtendedReg(ScratchReg))
	EncodedBytes = 13;
	else
	EncodedBytes = 12;

	EmitAndCountInstruction(
	MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
	+ // FIXME: Add retpoline support and remove this.
	+ if (Subtarget->useRetpoline())
	+ report_fatal_error(
	+ "Lowering patchpoint with retpoline not yet implemented.");
	EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
	}

	// Emit padding.
	unsigned NumBytes = opers.getNumPatchBytes();
	assert(NumBytes >= EncodedBytes &&
	"Patchpoint can't request size less than the length of a call.");

	// Only the bytes not already taken by the MOV + CALL pair are padded.
	EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
	getSubtargetInfo());
	}

	void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64");

	// We want to emit the following pattern, which follows the x86 calling
	// convention to prepare for the trampoline call to be patched in.
	//
	// .p2align 1, ...
	// .Lxray_event_sled_N:
	// jmp +N // jump across the instrumentation sled
	// ... // set up arguments in register
	// callq __xray_CustomEvent@plt // force dependency to symbol
	// ...
	// <jump here>
	//
	// After patching, it would look something like:
	//
	// nopw (2-byte nop)
	// ...
	// callq __xrayCustomEvent // already lowered
	// ...
	//
	// ---
	// First we emit the label and the jump.
	auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true);
	OutStreamer->AddComment("# XRay Custom Event Log");
	OutStreamer->EmitCodeAlignment(2);
	OutStreamer->EmitLabel(CurSled);

	// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
	// an operand (computed as an offset from the jmp instruction).
	// FIXME: Find another less hacky way do force the relative jump.
	OutStreamer->EmitBinaryData("\xeb\x0f");

	// The default C calling convention will place two arguments into %rcx and
	// %rdx -- so we only work with those.
	unsigned UsedRegs[] = {X86::RDI, X86::RSI};
	bool UsedMask[] = {false, false};

	// Then we put the operands in the %rdi and %rsi registers. We spill the
	// values in the register before we clobber them, and mark them as used in
	// UsedMask. In case the arguments are already in the correct register, we use
	// emit nops appropriately sized to keep the sled the same size in every
	// situation.
	for (unsigned I = 0; I < MI.getNumOperands(); ++I)
	if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
	assert(Op->isReg() && "Only support arguments in registers");
	if (Op->getReg() != UsedRegs[I]) {
	UsedMask[I] = true;
	EmitAndCountInstruction(
	MCInstBuilder(X86::PUSH64r).addReg(UsedRegs[I]));
	EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
	.addReg(UsedRegs[I])
	.addReg(Op->getReg()));
	} else {
	EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
	}
	}

	// We emit a hard dependency on the __xray_CustomEvent symbol, which is the
	// name of the trampoline to be implemented by the XRay runtime.
	auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent");
	MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
	if (isPositionIndependent())
	TOp.setTargetFlags(X86II::MO_PLT);

	// Emit the call instruction.
	EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32)
	.addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));

	// Restore caller-saved and used registers.
	for (unsigned I = sizeof UsedMask; I-- > 0;)
	if (UsedMask[I])
	EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(UsedRegs[I]));
	else
	EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());

	OutStreamer->AddComment("xray custom event end.");

	// Record the sled version. Older versions of this sled were spelled
	// differently, so we let the runtime handle the different offsets we're
	// using.
	recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1);
	}

	void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	// We want to emit the following pattern:
	//
	// .p2align 1, ...
	// .Lxray_sled_N:
	// jmp .tmpN
	// # 9 bytes worth of noops
	//
	// We need the 9 bytes because at runtime, we'd be patching over the full 11
	// bytes with the following pattern:
	//
	// mov %r10, <function id, 32-bit> // 6 bytes
	// call <relative offset, 32-bits> // 5 bytes
	//
	auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
	OutStreamer->EmitCodeAlignment(2);
	OutStreamer->EmitLabel(CurSled);

	// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
	// an operand (computed as an offset from the jmp instruction).
	// FIXME: Find another less hacky way do force the relative jump.
	OutStreamer->EmitBytes("\xeb\x09");
	EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
	recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
	}

	void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	// Since PATCHABLE_RET takes the opcode of the return statement as an
	// argument, we use that to emit the correct form of the RET that we want.
	// i.e. when we see this:
	//
	// PATCHABLE_RET X86::RET ...
	//
	// We should emit the RET followed by sleds.
	//
	// .p2align 1, ...
	// .Lxray_sled_N:
	// ret # or equivalent instruction
	// # 10 bytes worth of noops
	//
	// This just makes sure that the alignment for the next instruction is 2.
	auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
	OutStreamer->EmitCodeAlignment(2);
	OutStreamer->EmitLabel(CurSled);
	unsigned OpCode = MI.getOperand(0).getImm();
	MCInst Ret;
	Ret.setOpcode(OpCode);
	for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
	if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
	Ret.addOperand(MaybeOperand.getValue());
	OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
	EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
	recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
	}

	void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) {
	// Like PATCHABLE_RET, we have the actual instruction in the operands to this
	// instruction so we lower that particular instruction and its operands.
	// Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
	// we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to
	// the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
	// tail call much like how we have it in PATCHABLE_RET.
	auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
	OutStreamer->EmitCodeAlignment(2);
	OutStreamer->EmitLabel(CurSled);
	auto Target = OutContext.createTempSymbol();

	// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
	// an operand (computed as an offset from the jmp instruction).
	// FIXME: Find another less hacky way do force the relative jump.
	OutStreamer->EmitBytes("\xeb\x09");
	EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
	OutStreamer->EmitLabel(Target);
	recordSled(CurSled, MI, SledKind::TAIL_CALL);

	unsigned OpCode = MI.getOperand(0).getImm();
	MCInst TC;
	TC.setOpcode(OpCode);

	// Before emitting the instruction, add a comment to indicate that this is
	// indeed a tail call.
	OutStreamer->AddComment("TAILCALL");
	for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
	if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
	TC.addOperand(MaybeOperand.getValue());
	OutStreamer->EmitInstruction(TC, getSubtargetInfo());
	}

	// Returns the instruction preceding MBBI in the MachineFunction, stepping
	// back across basic-block boundaries (and over empty blocks) as needed.
	// If there is no such instruction because MBBI is at the very start of the
	// function's first block, returns a default-constructed iterator.
	static MachineBasicBlock::const_iterator
	PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
	  const MachineBasicBlock *Block = MBBI->getParent();
	  for (;;) {
	    // Something precedes MBBI inside the current block: done.
	    if (MBBI != Block->begin())
	      return --MBBI;

	    // At the start of the function's first block there is nothing earlier.
	    if (Block == &Block->getParent()->front())
	      return MachineBasicBlock::const_iterator();

	    // Otherwise continue from the end of the previous block.
	    Block = Block->getPrevNode();
	    MBBI = Block->end();
	  }
	}

	/// Look up the IR constant behind a constant-pool operand.
	///
	/// \param MI instruction whose parent function owns the constant pool.
	/// \param Op operand to inspect.
	/// \returns the constant, or nullptr when \p Op is not a constant-pool index,
	///          the entry is a machine constant pool entry, or the entry's value
	///          is not a Constant.
	static const Constant *getConstantFromPool(const MachineInstr &MI,
	                                           const MachineOperand &Op) {
	  if (!Op.isCPI())
	    return nullptr;

	  ArrayRef<MachineConstantPoolEntry> PoolEntries =
	      MI.getParent()->getParent()->getConstantPool()->getConstants();
	  const MachineConstantPoolEntry &Entry = PoolEntries[Op.getIndex()];

	  // Machine constant pool entries carry nothing we can dig a Constant out of.
	  if (Entry.isMachineConstantPoolEntry())
	    return nullptr;

	  auto *Result = dyn_cast<Constant>(Entry.Val.ConstVal);
	  assert((!Result || Entry.getType() == Result->getType()) &&
	         "Expected a constant of the same type!");
	  return Result;
	}

	// Build a human-readable comment describing a shuffle, of the form
	//   dst [{%k} [{z}]] = srcA[i,j,...],srcB[k,...],zero,...
	//
	// MI is the shuffle instruction; operand 0 is treated as the destination.
	// SrcOp1Idx / SrcOp2Idx are the operand indices of the two sources. A
	// non-register operand is printed as "mem".
	// Mask is the shuffle mask. Values below Mask.size() select from the first
	// source and larger values from the second; SM_SentinelZero prints as "zero"
	// and SM_SentinelUndef as "u".
	// Returns the comment text.
	static std::string getShuffleComment(const MachineInstr *MI,
	unsigned SrcOp1Idx,
	unsigned SrcOp2Idx,
	ArrayRef<int> Mask) {
	std::string Comment;

	// Compute the name for a register. This is really goofy because we have
	// multiple instruction printers that could (in theory) use different
	// names. Fortunately most people use the ATT style (outside of Windows)
	// and they actually agree on register naming here. Ultimately, this is
	// a comment, and so its OK if it isn't perfect.
	auto GetRegisterName = [](unsigned RegNum) -> StringRef {
	return X86ATTInstPrinter::getRegisterName(RegNum);
	};

	const MachineOperand &DstOp = MI->getOperand(0);
	const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx);
	const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx);

	StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
	StringRef Src1Name =
	SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
	StringRef Src2Name =
	SrcOp2.isReg() ? GetRegisterName(SrcOp2.getReg()) : "mem";

	// One source operand, fix the mask to print all elements in one span.
	// (Both names being equal is what counts as "one source" here; indices that
	// pointed into the second source are folded back into the first.)
	SmallVector<int, 8> ShuffleMask(Mask.begin(), Mask.end());
	if (Src1Name == Src2Name)
	for (int i = 0, e = ShuffleMask.size(); i != e; ++i)
	if (ShuffleMask[i] >= e)
	ShuffleMask[i] -= e;

	raw_string_ostream CS(Comment);
	CS << DstName;

	// Handle AVX512 MASK/MASKZ write mask comments.
	// MASK: zmmX {%kY}
	// MASKZ: zmmX {%kY} {z}
	// The operand just before the first source is taken to be the write mask.
	if (SrcOp1Idx > 1) {
	assert((SrcOp1Idx == 2 \|\| SrcOp1Idx == 3) && "Unexpected writemask");

	const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1);
	if (WriteMaskOp.isReg()) {
	CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}";

	// A first source at index 2 is printed as the zeroing (MASKZ) form.
	if (SrcOp1Idx == 2) {
	CS << " {z}";
	}
	}
	}

	CS << " = ";

	for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
	if (i != 0)
	CS << ",";
	if (ShuffleMask[i] == SM_SentinelZero) {
	CS << "zero";
	continue;
	}

	// Otherwise, it must come from src1 or src2. Print the span of elements
	// that comes from this src.
	bool isSrc1 = ShuffleMask[i] < (int)e;
	CS << (isSrc1 ? Src1Name : Src2Name) << '[';

	// Consume consecutive mask entries that stay on the same source, stopping
	// at a zero sentinel or at a switch to the other source.
	bool IsFirst = true;
	while (i != e && ShuffleMask[i] != SM_SentinelZero &&
	(ShuffleMask[i] < (int)e) == isSrc1) {
	if (!IsFirst)
	CS << ',';
	else
	IsFirst = false;
	if (ShuffleMask[i] == SM_SentinelUndef)
	CS << "u";
	else
	// Print the index relative to its own source.
	CS << ShuffleMask[i] % (int)e;
	++i;
	}
	CS << ']';
	--i; // For loop increments element #.
	}
	// Make sure everything written to CS has reached Comment before returning.
	CS.flush();

	return Comment;
	}

	static void printConstant(const Constant *COp, raw_ostream &CS) {
	if (isa<UndefValue>(COp)) {
	CS << "u";
	} else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
	if (CI->getBitWidth() <= 64) {
	CS << CI->getZExtValue();
	} else {
	// print multi-word constant as (w0,w1)
	const auto &Val = CI->getValue();
	CS << "(";
	for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
	if (i > 0)
	CS << ",";
	CS << Val.getRawData()[i];
	}
	CS << ")";
	}
	} else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
	SmallString<32> Str;
	CF->getValueAPF().toString(Str);
	CS << Str;
	} else {
	CS << "?";
	}
	}

	void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only");
	const X86RegisterInfo *RI =
	MF->getSubtarget<X86Subtarget>().getRegisterInfo();

	// Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86.
	if (EmitFPOData) {
	X86TargetStreamer *XTS =
	static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
	switch (MI->getOpcode()) {
	case X86::SEH_PushReg:
	XTS->emitFPOPushReg(MI->getOperand(0).getImm());
	break;
	case X86::SEH_StackAlloc:
	XTS->emitFPOStackAlloc(MI->getOperand(0).getImm());
	break;
	case X86::SEH_SetFrame:
	assert(MI->getOperand(1).getImm() == 0 &&
	".cv_fpo_setframe takes no offset");
	XTS->emitFPOSetFrame(MI->getOperand(0).getImm());
	break;
	case X86::SEH_EndPrologue:
	XTS->emitFPOEndPrologue();
	break;
	case X86::SEH_SaveReg:
	case X86::SEH_SaveXMM:
	case X86::SEH_PushFrame:
	llvm_unreachable("SEH_ directive incompatible with FPO");
	break;
	default:
	llvm_unreachable("expected SEH_ instruction");
	}
	return;
	}

	// Otherwise, use the .seh_ directives for all other Windows platforms.
	switch (MI->getOpcode()) {
	case X86::SEH_PushReg:
	OutStreamer->EmitWinCFIPushReg(
	RI->getSEHRegNum(MI->getOperand(0).getImm()));
	break;

	case X86::SEH_SaveReg:
	OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
	MI->getOperand(1).getImm());
	break;

	case X86::SEH_SaveXMM:
	OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
	MI->getOperand(1).getImm());
	break;

	case X86::SEH_StackAlloc:
	OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
	break;

	case X86::SEH_SetFrame:
	OutStreamer->EmitWinCFISetFrame(
	RI->getSEHRegNum(MI->getOperand(0).getImm()),
	MI->getOperand(1).getImm());
	break;

	case X86::SEH_PushFrame:
	OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
	break;

	case X86::SEH_EndPrologue:
	OutStreamer->EmitWinCFIEndProlog();
	break;

	default:
	llvm_unreachable("expected SEH_ instruction");
	}
	}

	/// Lower a single MachineInstr to MC and emit it through OutStreamer.
	///
	/// The opcode switch has two kinds of cases:
	///  - cases that emit everything themselves (TLS address sequences,
	///    MOVPC32r, stackmaps/patchpoints/patchable ops, SEH_ pseudos, ...) and
	///    return early;
	///  - cases that only attach a comment (tail calls, constant shuffle masks,
	///    constant-pool loads and broadcasts) and then break out of the switch
	///    so the instruction is lowered generically at the bottom.
	///
	/// Calls are emitted specially at the end so that any pending stackmap
	/// shadow is padded before the call rather than after it.
	void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
	  X86MCInstLower MCInstLowering(MF, this);
	  const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();

	  // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
	  // are compressed from EVEX encoding to VEX encoding.
	  if (TM.Options.MCOptions.ShowMCEncoding) {
	    if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX)
	      OutStreamer->AddComment("EVEX TO VEX Compression ", false);
	  }

	  switch (MI->getOpcode()) {
	  case TargetOpcode::DBG_VALUE:
	    llvm_unreachable("Should be handled target independently");

	  // Emit nothing here but a comment if we can.
	  case X86::Int_MemBarrier:
	    OutStreamer->emitRawComment("MEMBARRIER");
	    return;


	  case X86::EH_RETURN:
	  case X86::EH_RETURN64: {
	    // Lower these as normal, but add some comments.
	    unsigned Reg = MI->getOperand(0).getReg();
	    OutStreamer->AddComment(StringRef("eh_return, addr: %") +
	                            X86ATTInstPrinter::getRegisterName(Reg));
	    break;
	  }
	  case X86::CLEANUPRET: {
	    // Lower these as normal, but add some comments.
	    OutStreamer->AddComment("CLEANUPRET");
	    break;
	  }

	  case X86::CATCHRET: {
	    // Lower these as normal, but add some comments.
	    OutStreamer->AddComment("CATCHRET");
	    break;
	  }

	  case X86::TAILJMPr:
	  case X86::TAILJMPm:
	  case X86::TAILJMPd:
	  case X86::TAILJMPd_CC:
	  case X86::TAILJMPr64:
	  case X86::TAILJMPm64:
	  case X86::TAILJMPd64:
	  case X86::TAILJMPd64_CC:
	  case X86::TAILJMPr64_REX:
	  case X86::TAILJMPm64_REX:
	    // Lower these as normal, but add some comments.
	    OutStreamer->AddComment("TAILCALL");
	    break;

	  case X86::TLS_addr32:
	  case X86::TLS_addr64:
	  case X86::TLS_base_addr32:
	  case X86::TLS_base_addr64:
	    return LowerTlsAddr(MCInstLowering, *MI);

	  case X86::MOVPC32r: {
	    // This is a pseudo op for a two instruction sequence with a label, which
	    // looks like:
	    //     call "L1$pb"
	    // "L1$pb":
	    //     popl %esi

	    // Emit the call.
	    MCSymbol *PICBase = MF->getPICBaseSymbol();
	    // FIXME: We would like an efficient form for this, so we don't have to do a
	    // lot of extra uniquing.
	    EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
	      .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));

	    const X86FrameLowering* FrameLowering =
	        MF->getSubtarget<X86Subtarget>().getFrameLowering();
	    bool hasFP = FrameLowering->hasFP(*MF);

	    // TODO: This is needed only if we require precise CFA.
	    bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
	                               !OutStreamer->getDwarfFrameInfos().back().End;

	    int stackGrowth = -RI->getSlotSize();

	    // The call pushed a return address; account for it in the CFA offset
	    // until the pop below removes it again.
	    if (HasActiveDwarfFrame && !hasFP) {
	      OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
	    }

	    // Emit the label.
	    OutStreamer->EmitLabel(PICBase);

	    // popl $reg
	    EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
	                            .addReg(MI->getOperand(0).getReg()));

	    if (HasActiveDwarfFrame && !hasFP) {
	      OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
	    }
	    return;
	  }

	  case X86::ADD32ri: {
	    // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
	    if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
	      break;

	    // Okay, we have something like:
	    //  EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)

	    // For this, we want to print something like:
	    //   MYGLOBAL + (. - PICBASE)
	    // However, we can't generate a ".", so just emit a new label here and refer
	    // to it.
	    MCSymbol *DotSym = OutContext.createTempSymbol();
	    OutStreamer->EmitLabel(DotSym);

	    // Now that we have emitted the label, lower the complex operand expression.
	    MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));

	    const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
	    const MCExpr *PICBase =
	      MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
	    DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);

	    DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
	                                      DotExpr, OutContext);

	    EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
	      .addReg(MI->getOperand(0).getReg())
	      .addReg(MI->getOperand(1).getReg())
	      .addExpr(DotExpr));
	    return;
	  }
	  case TargetOpcode::STATEPOINT:
	    return LowerSTATEPOINT(*MI, MCInstLowering);

	  case TargetOpcode::FAULTING_OP:
	    return LowerFAULTING_OP(*MI, MCInstLowering);

	  case TargetOpcode::FENTRY_CALL:
	    return LowerFENTRY_CALL(*MI, MCInstLowering);

	  case TargetOpcode::PATCHABLE_OP:
	    return LowerPATCHABLE_OP(*MI, MCInstLowering);

	  case TargetOpcode::STACKMAP:
	    return LowerSTACKMAP(*MI);

	  case TargetOpcode::PATCHPOINT:
	    return LowerPATCHPOINT(*MI, MCInstLowering);

	  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
	    return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);

	  case TargetOpcode::PATCHABLE_RET:
	    return LowerPATCHABLE_RET(*MI, MCInstLowering);

	  case TargetOpcode::PATCHABLE_TAIL_CALL:
	    return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);

	  case TargetOpcode::PATCHABLE_EVENT_CALL:
	    return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);

	  case X86::MORESTACK_RET:
	    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
	    return;

	  case X86::MORESTACK_RET_RESTORE_R10:
	    // Return, then restore R10.
	    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
	    EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
	                            .addReg(X86::R10)
	                            .addReg(X86::RAX));
	    return;

	  case X86::SEH_PushReg:
	  case X86::SEH_SaveReg:
	  case X86::SEH_SaveXMM:
	  case X86::SEH_StackAlloc:
	  case X86::SEH_SetFrame:
	  case X86::SEH_PushFrame:
	  case X86::SEH_EndPrologue:
	    EmitSEHInstruction(MI);
	    return;

	  case X86::SEH_Epilogue: {
	    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	    MachineBasicBlock::const_iterator MBBI(MI);
	    // Check if preceded by a call and emit nop if so.
	    for (MBBI = PrevCrossBBInst(MBBI);
	         MBBI != MachineBasicBlock::const_iterator();
	         MBBI = PrevCrossBBInst(MBBI)) {
	      // Conservatively assume that pseudo instructions don't emit code and keep
	      // looking for a call. We may emit an unnecessary nop in some cases.
	      if (!MBBI->isPseudo()) {
	        if (MBBI->isCall())
	          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
	        break;
	      }
	    }
	    return;
	  }

	  // Lower PSHUFB and VPERMILP normally but add a comment if we can find
	  // a constant shuffle mask. We won't be able to do this at the MC layer
	  // because the mask isn't an immediate.
	  case X86::PSHUFBrm:
	  case X86::VPSHUFBrm:
	  case X86::VPSHUFBYrm:
	  case X86::VPSHUFBZ128rm:
	  case X86::VPSHUFBZ128rmk:
	  case X86::VPSHUFBZ128rmkz:
	  case X86::VPSHUFBZ256rm:
	  case X86::VPSHUFBZ256rmk:
	  case X86::VPSHUFBZ256rmkz:
	  case X86::VPSHUFBZrm:
	  case X86::VPSHUFBZrmk:
	  case X86::VPSHUFBZrmkz: {
	    if (!OutStreamer->isVerboseAsm())
	      break;
	    // The masked (k) and zero-masked (kz) forms use larger operand indices
	    // for the source and the mask than the unmasked form.
	    unsigned SrcIdx, MaskIdx;
	    switch (MI->getOpcode()) {
	    default: llvm_unreachable("Invalid opcode");
	    case X86::PSHUFBrm:
	    case X86::VPSHUFBrm:
	    case X86::VPSHUFBYrm:
	    case X86::VPSHUFBZ128rm:
	    case X86::VPSHUFBZ256rm:
	    case X86::VPSHUFBZrm:
	      SrcIdx = 1; MaskIdx = 5; break;
	    case X86::VPSHUFBZ128rmkz:
	    case X86::VPSHUFBZ256rmkz:
	    case X86::VPSHUFBZrmkz:
	      SrcIdx = 2; MaskIdx = 6; break;
	    case X86::VPSHUFBZ128rmk:
	    case X86::VPSHUFBZ256rmk:
	    case X86::VPSHUFBZrmk:
	      SrcIdx = 3; MaskIdx = 7; break;
	    }

	    // MaskIdx may be as large as 7, so check against it rather than against
	    // a fixed operand count.
	    assert(MI->getNumOperands() > MaskIdx &&
	           "Mask operand index is out of range!");

	    const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
	    if (auto C = getConstantFromPool(MI, MaskOp)) {
	      SmallVector<int, 64> Mask;
	      DecodePSHUFBMask(C, Mask);
	      if (!Mask.empty())
	        OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
	                                !EnablePrintSchedInfo);
	    }
	    break;
	  }

	  case X86::VPERMILPSrm:
	  case X86::VPERMILPSYrm:
	  case X86::VPERMILPSZ128rm:
	  case X86::VPERMILPSZ128rmk:
	  case X86::VPERMILPSZ128rmkz:
	  case X86::VPERMILPSZ256rm:
	  case X86::VPERMILPSZ256rmk:
	  case X86::VPERMILPSZ256rmkz:
	  case X86::VPERMILPSZrm:
	  case X86::VPERMILPSZrmk:
	  case X86::VPERMILPSZrmkz:
	  case X86::VPERMILPDrm:
	  case X86::VPERMILPDYrm:
	  case X86::VPERMILPDZ128rm:
	  case X86::VPERMILPDZ128rmk:
	  case X86::VPERMILPDZ128rmkz:
	  case X86::VPERMILPDZ256rm:
	  case X86::VPERMILPDZ256rmk:
	  case X86::VPERMILPDZ256rmkz:
	  case X86::VPERMILPDZrm:
	  case X86::VPERMILPDZrmk:
	  case X86::VPERMILPDZrmkz: {
	    if (!OutStreamer->isVerboseAsm())
	      break;
	    unsigned SrcIdx, MaskIdx;
	    unsigned ElSize;
	    switch (MI->getOpcode()) {
	    default: llvm_unreachable("Invalid opcode");
	    case X86::VPERMILPSrm:
	    case X86::VPERMILPSYrm:
	    case X86::VPERMILPSZ128rm:
	    case X86::VPERMILPSZ256rm:
	    case X86::VPERMILPSZrm:
	      SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
	    case X86::VPERMILPSZ128rmkz:
	    case X86::VPERMILPSZ256rmkz:
	    case X86::VPERMILPSZrmkz:
	      SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
	    case X86::VPERMILPSZ128rmk:
	    case X86::VPERMILPSZ256rmk:
	    case X86::VPERMILPSZrmk:
	      SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
	    case X86::VPERMILPDrm:
	    case X86::VPERMILPDYrm:
	    case X86::VPERMILPDZ128rm:
	    case X86::VPERMILPDZ256rm:
	    case X86::VPERMILPDZrm:
	      SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
	    case X86::VPERMILPDZ128rmkz:
	    case X86::VPERMILPDZ256rmkz:
	    case X86::VPERMILPDZrmkz:
	      SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
	    case X86::VPERMILPDZ128rmk:
	    case X86::VPERMILPDZ256rmk:
	    case X86::VPERMILPDZrmk:
	      SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
	    }

	    // MaskIdx may be as large as 7, so check against it rather than against
	    // a fixed operand count.
	    assert(MI->getNumOperands() > MaskIdx &&
	           "Mask operand index is out of range!");

	    const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
	    if (auto C = getConstantFromPool(MI, MaskOp)) {
	      SmallVector<int, 16> Mask;
	      DecodeVPERMILPMask(C, ElSize, Mask);
	      if (!Mask.empty())
	        OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
	                                !EnablePrintSchedInfo);
	    }
	    break;
	  }

	  case X86::VPERMIL2PDrm:
	  case X86::VPERMIL2PSrm:
	  case X86::VPERMIL2PDYrm:
	  case X86::VPERMIL2PSYrm: {
	    if (!OutStreamer->isVerboseAsm())
	      break;
	    assert(MI->getNumOperands() >= 8 &&
	           "We should always have at least 8 operands!");

	    // The control value is read from the last operand; without an immediate
	    // there the mask cannot be decoded.
	    const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
	    if (!CtrlOp.isImm())
	      break;

	    unsigned ElSize;
	    switch (MI->getOpcode()) {
	    default: llvm_unreachable("Invalid opcode");
	    case X86::VPERMIL2PSrm: case X86::VPERMIL2PSYrm: ElSize = 32; break;
	    case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
	    }

	    const MachineOperand &MaskOp = MI->getOperand(6);
	    if (auto C = getConstantFromPool(MI, MaskOp)) {
	      SmallVector<int, 16> Mask;
	      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
	      if (!Mask.empty())
	        OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
	                                !EnablePrintSchedInfo);
	    }
	    break;
	  }

	  case X86::VPPERMrrm: {
	    if (!OutStreamer->isVerboseAsm())
	      break;
	    assert(MI->getNumOperands() >= 7 &&
	           "We should always have at least 7 operands!");

	    const MachineOperand &MaskOp = MI->getOperand(6);
	    if (auto C = getConstantFromPool(MI, MaskOp)) {
	      SmallVector<int, 16> Mask;
	      DecodeVPPERMMask(C, Mask);
	      if (!Mask.empty())
	        OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
	                                !EnablePrintSchedInfo);
	    }
	    break;
	  }

	#define MOV_CASE(Prefix, Suffix)        \
	  case X86::Prefix##MOVAPD##Suffix##rm: \
	  case X86::Prefix##MOVAPS##Suffix##rm: \
	  case X86::Prefix##MOVUPD##Suffix##rm: \
	  case X86::Prefix##MOVUPS##Suffix##rm: \
	  case X86::Prefix##MOVDQA##Suffix##rm: \
	  case X86::Prefix##MOVDQU##Suffix##rm:

	#define MOV_AVX512_CASE(Suffix)         \
	  case X86::VMOVDQA64##Suffix##rm:      \
	  case X86::VMOVDQA32##Suffix##rm:      \
	  case X86::VMOVDQU64##Suffix##rm:      \
	  case X86::VMOVDQU32##Suffix##rm:      \
	  case X86::VMOVDQU16##Suffix##rm:      \
	  case X86::VMOVDQU8##Suffix##rm:       \
	  case X86::VMOVAPS##Suffix##rm:        \
	  case X86::VMOVAPD##Suffix##rm:        \
	  case X86::VMOVUPS##Suffix##rm:        \
	  case X86::VMOVUPD##Suffix##rm:

	#define CASE_ALL_MOV_RM()               \
	  MOV_CASE(, )   /* SSE */              \
	  MOV_CASE(V, )  /* AVX-128 */          \
	  MOV_CASE(V, Y) /* AVX-256 */          \
	  MOV_AVX512_CASE(Z)                    \
	  MOV_AVX512_CASE(Z256)                 \
	  MOV_AVX512_CASE(Z128)

	  // For loads from a constant pool to a vector register, print the constant
	  // loaded.
	  CASE_ALL_MOV_RM()
	  case X86::VBROADCASTF128:
	  case X86::VBROADCASTI128:
	  case X86::VBROADCASTF32X4Z256rm:
	  case X86::VBROADCASTF32X4rm:
	  case X86::VBROADCASTF32X8rm:
	  case X86::VBROADCASTF64X2Z128rm:
	  case X86::VBROADCASTF64X2rm:
	  case X86::VBROADCASTF64X4rm:
	  case X86::VBROADCASTI32X4Z256rm:
	  case X86::VBROADCASTI32X4rm:
	  case X86::VBROADCASTI32X8rm:
	  case X86::VBROADCASTI64X2Z128rm:
	  case X86::VBROADCASTI64X2rm:
	  case X86::VBROADCASTI64X4rm:
	    if (!OutStreamer->isVerboseAsm())
	      break;
	    if (MI->getNumOperands() <= 4)
	      break;
	    if (auto C = getConstantFromPool(MI, MI->getOperand(4))) {
	      // Number of times the loaded constant is repeated in the comment.
	      int NumLanes = 1;
	      // Override NumLanes for the broadcast instructions.
	      switch (MI->getOpcode()) {
	      case X86::VBROADCASTF128:        NumLanes = 2; break;
	      case X86::VBROADCASTI128:        NumLanes = 2; break;
	      case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break;
	      case X86::VBROADCASTF32X4rm:     NumLanes = 4; break;
	      case X86::VBROADCASTF32X8rm:     NumLanes = 2; break;
	      case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break;
	      case X86::VBROADCASTF64X2rm:     NumLanes = 4; break;
	      case X86::VBROADCASTF64X4rm:     NumLanes = 2; break;
	      case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break;
	      case X86::VBROADCASTI32X4rm:     NumLanes = 4; break;
	      case X86::VBROADCASTI32X8rm:     NumLanes = 2; break;
	      case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break;
	      case X86::VBROADCASTI64X2rm:     NumLanes = 4; break;
	      case X86::VBROADCASTI64X4rm:     NumLanes = 2; break;
	      default:
	        // Plain vector moves keep the single lane set above.
	        break;
	      }

	      std::string Comment;
	      raw_string_ostream CS(Comment);
	      const MachineOperand &DstOp = MI->getOperand(0);
	      CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
	      if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
	        CS << "[";
	        for (int l = 0; l != NumLanes; ++l) {
	          for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
	            // Separate every element except the very first one.
	            if (i != 0 || l != 0)
	              CS << ",";
	            if (CDS->getElementType()->isIntegerTy())
	              CS << CDS->getElementAsInteger(i);
	            else if (CDS->getElementType()->isFloatTy())
	              CS << CDS->getElementAsFloat(i);
	            else if (CDS->getElementType()->isDoubleTy())
	              CS << CDS->getElementAsDouble(i);
	            else
	              CS << "?";
	          }
	        }
	        CS << "]";
	        OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
	      } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
	        CS << "<";
	        for (int l = 0; l != NumLanes; ++l) {
	          for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
	            // Separate every element except the very first one.
	            if (i != 0 || l != 0)
	              CS << ",";
	            printConstant(CV->getOperand(i), CS);
	          }
	        }
	        CS << ">";
	        OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
	      }
	    }
	    break;
	  case X86::VBROADCASTSSrm:
	  case X86::VBROADCASTSSYrm:
	  case X86::VBROADCASTSSZ128m:
	  case X86::VBROADCASTSSZ256m:
	  case X86::VBROADCASTSSZm:
	  case X86::VBROADCASTSDYrm:
	  case X86::VBROADCASTSDZ256m:
	  case X86::VBROADCASTSDZm:
	  case X86::VPBROADCASTBrm:
	  case X86::VPBROADCASTBYrm:
	  case X86::VPBROADCASTBZ128m:
	  case X86::VPBROADCASTBZ256m:
	  case X86::VPBROADCASTBZm:
	  case X86::VPBROADCASTDrm:
	  case X86::VPBROADCASTDYrm:
	  case X86::VPBROADCASTDZ128m:
	  case X86::VPBROADCASTDZ256m:
	  case X86::VPBROADCASTDZm:
	  case X86::VPBROADCASTQrm:
	  case X86::VPBROADCASTQYrm:
	  case X86::VPBROADCASTQZ128m:
	  case X86::VPBROADCASTQZ256m:
	  case X86::VPBROADCASTQZm:
	  case X86::VPBROADCASTWrm:
	  case X86::VPBROADCASTWYrm:
	  case X86::VPBROADCASTWZ128m:
	  case X86::VPBROADCASTWZ256m:
	  case X86::VPBROADCASTWZm:
	    if (!OutStreamer->isVerboseAsm())
	      break;
	    if (MI->getNumOperands() <= 4)
	      break;
	    if (auto C = getConstantFromPool(MI, MI->getOperand(4))) {
	      // Number of times the scalar constant is repeated in the comment.
	      int NumElts;
	      switch (MI->getOpcode()) {
	      default: llvm_unreachable("Invalid opcode");
	      case X86::VBROADCASTSSrm:    NumElts = 4;  break;
	      case X86::VBROADCASTSSYrm:   NumElts = 8;  break;
	      case X86::VBROADCASTSSZ128m: NumElts = 4;  break;
	      case X86::VBROADCASTSSZ256m: NumElts = 8;  break;
	      case X86::VBROADCASTSSZm:    NumElts = 16; break;
	      case X86::VBROADCASTSDYrm:   NumElts = 4;  break;
	      case X86::VBROADCASTSDZ256m: NumElts = 4;  break;
	      case X86::VBROADCASTSDZm:    NumElts = 8;  break;
	      case X86::VPBROADCASTBrm:    NumElts = 16; break;
	      case X86::VPBROADCASTBYrm:   NumElts = 32; break;
	      case X86::VPBROADCASTBZ128m: NumElts = 16; break;
	      case X86::VPBROADCASTBZ256m: NumElts = 32; break;
	      case X86::VPBROADCASTBZm:    NumElts = 64; break;
	      case X86::VPBROADCASTDrm:    NumElts = 4;  break;
	      case X86::VPBROADCASTDYrm:   NumElts = 8;  break;
	      case X86::VPBROADCASTDZ128m: NumElts = 4;  break;
	      case X86::VPBROADCASTDZ256m: NumElts = 8;  break;
	      case X86::VPBROADCASTDZm:    NumElts = 16; break;
	      case X86::VPBROADCASTQrm:    NumElts = 2;  break;
	      case X86::VPBROADCASTQYrm:   NumElts = 4;  break;
	      case X86::VPBROADCASTQZ128m: NumElts = 2;  break;
	      case X86::VPBROADCASTQZ256m: NumElts = 4;  break;
	      case X86::VPBROADCASTQZm:    NumElts = 8;  break;
	      case X86::VPBROADCASTWrm:    NumElts = 8;  break;
	      case X86::VPBROADCASTWYrm:   NumElts = 16; break;
	      case X86::VPBROADCASTWZ128m: NumElts = 8;  break;
	      case X86::VPBROADCASTWZ256m: NumElts = 16; break;
	      case X86::VPBROADCASTWZm:    NumElts = 32; break;
	      }

	      std::string Comment;
	      raw_string_ostream CS(Comment);
	      const MachineOperand &DstOp = MI->getOperand(0);
	      CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
	      CS << "[";
	      for (int i = 0; i != NumElts; ++i) {
	        if (i != 0)
	          CS << ",";
	        printConstant(C, CS);
	      }
	      CS << "]";
	      OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
	    }
	    break;
	  }

	  // Generic path: lower the instruction to an MCInst.
	  MCInst TmpInst;
	  MCInstLowering.Lower(MI, TmpInst);
	  if (MI->getAsmPrinterFlag(MachineInstr::NoSchedComment))
	    TmpInst.setFlags(TmpInst.getFlags() | X86::NO_SCHED_INFO);

	  // Stackmap shadows cannot include branch targets, so we can count the bytes
	  // in a call towards the shadow, but must ensure that no thread returns
	  // into the stackmap shadow. The only way to achieve this is if the call
	  // is at the end of the shadow.
	  if (MI->isCall()) {
	    // Count the size of the call towards the shadow
	    SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
	    // Then flush the shadow so that we fill with nops before the call, not
	    // after it.
	    SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
	    // Then emit the call
	    OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
	    return;
	  }

	  EmitAndCountInstruction(TmpInst);
	}
	Index: head/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp (nonexistent)
	+++ head/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp (revision 328817)
	@@ -0,0 +1,311 @@
	+//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====//
	+//
	+// The LLVM Compiler Infrastructure
	+//
	+// This file is distributed under the University of Illinois Open Source
	+// License. See LICENSE.TXT for details.
	+//
	+//===----------------------------------------------------------------------===//
	+/// \file
	+///
	+/// Pass that injects an MI thunk implementing a "retpoline". This is
	+/// a RET-implemented trampoline that is used to lower indirect calls in a way
	+/// that prevents speculation on some x86 processors and can be used to mitigate
	+/// security vulnerabilities due to targeted speculative execution and side
	+/// channels such as CVE-2017-5715.
	+///
	+/// TODO(chandlerc): All of this code could use better comments and
	+/// documentation.
	+///
	+//===----------------------------------------------------------------------===//
	+
	+#include "X86.h"
	+#include "X86InstrBuilder.h"
	+#include "X86Subtarget.h"
	+#include "llvm/CodeGen/MachineFunction.h"
	+#include "llvm/CodeGen/MachineInstrBuilder.h"
	+#include "llvm/CodeGen/MachineModuleInfo.h"
	+#include "llvm/CodeGen/Passes.h"
	+#include "llvm/CodeGen/TargetPassConfig.h"
	+#include "llvm/IR/IRBuilder.h"
	+#include "llvm/IR/Instructions.h"
	+#include "llvm/IR/Module.h"
	+#include "llvm/Support/CommandLine.h"
	+#include "llvm/Support/Debug.h"
	+#include "llvm/Support/raw_ostream.h"
	+
	+using namespace llvm;
	+
	+#define DEBUG_TYPE "x86-retpoline-thunks"
	+
	+static const char ThunkNamePrefix[] = "__llvm_retpoline_";
	+static const char R11ThunkName[] = "__llvm_retpoline_r11";
	+static const char EAXThunkName[] = "__llvm_retpoline_eax";
	+static const char ECXThunkName[] = "__llvm_retpoline_ecx";
	+static const char EDXThunkName[] = "__llvm_retpoline_edx";
	+static const char PushThunkName[] = "__llvm_retpoline_push";
	+
	+namespace {
	+/// Machine function pass that adds the `__llvm_retpoline_*` thunk functions to
	+/// the module and fills them with the retpoline instruction sequence.
	+class X86RetpolineThunks : public MachineFunctionPass {
	+public:
	+  static char ID;
	+
	+  X86RetpolineThunks() : MachineFunctionPass(ID) {}
	+
	+  StringRef getPassName() const override { return "X86 Retpoline Thunks"; }
	+
	+  /// Resets the per-module "thunks already inserted" flag.
	+  bool doInitialization(Module &M) override;
	+  /// Creates the thunk functions when visiting the first function that needs
	+  /// them, and populates a thunk when visiting the thunk itself.
	+  bool runOnMachineFunction(MachineFunction &F) override;
	+
	+  void getAnalysisUsage(AnalysisUsage &AU) const override {
	+    MachineFunctionPass::getAnalysisUsage(AU);
	+    AU.addRequired<MachineModuleInfo>();
	+    AU.addPreserved<MachineModuleInfo>();
	+  }
	+
	+private:
	+  // Per-function state, refreshed at the top of runOnMachineFunction. The
	+  // default initializers only keep the members from holding indeterminate
	+  // values before the first run.
	+  MachineModuleInfo *MMI = nullptr;
	+  const TargetMachine *TM = nullptr;
	+  bool Is64Bit = false;
	+  const X86Subtarget *STI = nullptr;
	+  const X86InstrInfo *TII = nullptr;
	+
	+  // Set once the thunk functions have been added to the current module.
	+  bool InsertedThunks = false;
	+
	+  /// Adds an (almost empty) IR function named \p Name to \p M.
	+  void createThunkFunction(Module &M, StringRef Name);
	+  /// Appends a store of \p Reg to the top of the stack to \p MBB.
	+  void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
	+  /// Appends the register-free 32-bit return address replacement to \p MBB.
	+  void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
	+  /// Fills the thunk \p MF; \p Reg is the register holding the call target,
	+  /// or None for the 32-bit "push" thunk.
	+  void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
	+};
	+
	+} // end anonymous namespace
	+
	+/// Factory for the retpoline thunk pass; the returned pass is heap allocated.
	+FunctionPass *llvm::createX86RetpolineThunksPass() {
	+  FunctionPass *ThunkPass = new X86RetpolineThunks();
	+  return ThunkPass;
	+}
	+
	+char X86RetpolineThunks::ID = 0;
	+
	+/// Forget that thunks were inserted so they are created again for \p M.
	+/// Always returns false.
	+bool X86RetpolineThunks::doInitialization(Module &M) {
	+  const bool ChangedModule = false;
	+  InsertedThunks = false;
	+  return ChangedModule;
	+}
	+
	+/// Per-function entry point of the pass.
	+///
	+/// For a function whose name does not start with ThunkNamePrefix: the first
	+/// time such a function has retpolines enabled without external thunks, the
	+/// thunk functions are added to the module and true is returned; otherwise
	+/// nothing happens and false is returned.
	+///
	+/// For a thunk function: its body is replaced with the retpoline sequence
	+/// and true is returned.
	+bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
	+  DEBUG(dbgs() << getPassName() << '\n');
	+
	+  TM = &MF.getTarget();
	+  STI = &MF.getSubtarget<X86Subtarget>();
	+  TII = STI->getInstrInfo();
	+  Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64;
	+
	+  MMI = &getAnalysis<MachineModuleInfo>();
	+  Module &M = const_cast<Module &>(*MMI->getModule());
	+
	+  // If this function is not a thunk, check to see if we need to insert
	+  // a thunk.
	+  if (!MF.getName().startswith(ThunkNamePrefix)) {
	+    // If we've already inserted a thunk, nothing else to do.
	+    if (InsertedThunks)
	+      return false;
	+
	+    // Only add a thunk if one of the functions has the retpoline feature
	+    // enabled in its subtarget, and doesn't enable external thunks.
	+    // FIXME: Conditionalize on indirect calls so we don't emit a thunk when
	+    // nothing will end up calling it.
	+    // FIXME: It's a little silly to look at every function just to enumerate
	+    // the subtargets, but eventually we'll want to look at them for indirect
	+    // calls, so maybe this is OK.
	+    if (!STI->useRetpoline() || STI->useRetpolineExternalThunk())
	+      return false;
	+
	+    // Otherwise, we need to insert the thunk.
	+    // WARNING: This is not really a well behaving thing to do in a function
	+    // pass. We extract the module and insert a new function (and machine
	+    // function) directly into the module.
	+    if (Is64Bit)
	+      createThunkFunction(M, R11ThunkName);
	+    else
	+      for (StringRef Name :
	+           {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName})
	+        createThunkFunction(M, Name);
	+    InsertedThunks = true;
	+    return true;
	+  }
	+
	+  // If this is a thunk function, we need to populate it with the correct MI.
	+  if (Is64Bit) {
	+    assert(MF.getName() == R11ThunkName &&
	+           "Should only have an r11 thunk on 64-bit targets");
	+
	+    // __llvm_retpoline_r11:
	+    //   callq .Lr11_call_target
	+    // .Lr11_capture_spec:
	+    //   pause
	+    //   lfence
	+    //   jmp .Lr11_capture_spec
	+    // .align 16
	+    // .Lr11_call_target:
	+    //   movq %r11, (%rsp)
	+    //   retq
	+    populateThunk(MF, X86::R11);
	+  } else {
	+    // For 32-bit targets we need to emit a collection of thunks for various
	+    // possible scratch registers as well as a fallback that is used when
	+    // there are no scratch registers and assumes the retpoline target has
	+    // been pushed.
	+    //   __llvm_retpoline_eax:
	+    //         calll .Leax_call_target
	+    //   .Leax_capture_spec:
	+    //         pause
	+    //         lfence
	+    //         jmp .Leax_capture_spec
	+    //   .align 16
	+    //   .Leax_call_target:
	+    //         movl %eax, (%esp)  # Clobber return addr
	+    //         retl
	+    //
	+    //   __llvm_retpoline_ecx:
	+    //   ... # Same setup
	+    //         movl %ecx, (%esp)
	+    //         retl
	+    //
	+    //   __llvm_retpoline_edx:
	+    //   ... # Same setup
	+    //         movl %edx, (%esp)
	+    //         retl
	+    //
	+    // This last one is a bit more special and so needs a little extra
	+    // handling.
	+    // __llvm_retpoline_push:
	+    //         calll .Lpush_call_target
	+    // .Lpush_capture_spec:
	+    //         pause
	+    //         lfence
	+    //         jmp .Lpush_capture_spec
	+    // .align 16
	+    // .Lpush_call_target:
	+    //         # Clear pause_loop return address.
	+    //         addl $4, %esp
	+    //         # Top of stack words are: Callee, RA. Exchange Callee and RA.
	+    //         pushl 4(%esp)  # Push callee
	+    //         pushl 4(%esp)  # Push RA
	+    //         popl 8(%esp)   # Pop RA to final RA
	+    //         popl (%esp)    # Pop callee to next top of stack
	+    //         retl           # Ret to callee
	+    if (MF.getName() == EAXThunkName)
	+      populateThunk(MF, X86::EAX);
	+    else if (MF.getName() == ECXThunkName)
	+      populateThunk(MF, X86::ECX);
	+    else if (MF.getName() == EDXThunkName)
	+      populateThunk(MF, X86::EDX);
	+    else if (MF.getName() == PushThunkName)
	+      populateThunk(MF);
	+    else
	+      llvm_unreachable("Invalid thunk name on x86-32!");
	+  }
	+
	+  return true;
	+}
	+
	+/// Add a `void()` function called \p Name to \p M whose body is a single
	+/// `ret void`. The function is linkonce_odr, hidden, placed in a comdat of
	+/// the same name, and marked nounwind and naked.
	+void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) {
	+  assert(Name.startswith(ThunkNamePrefix) &&
	+         "Created a thunk with an unexpected prefix!");
	+
	+  LLVMContext &Context = M.getContext();
	+  FunctionType *ThunkTy =
	+      FunctionType::get(Type::getVoidTy(Context), false);
	+  Function *Thunk =
	+      Function::Create(ThunkTy, GlobalValue::LinkOnceODRLinkage, Name, &M);
	+  Thunk->setVisibility(GlobalValue::HiddenVisibility);
	+  Thunk->setComdat(M.getOrInsertComdat(Name));
	+
	+  // Attributes that keep a frame and unwind information from being created
	+  // and prevent inlining.
	+  AttrBuilder ThunkAttrs;
	+  ThunkAttrs.addAttribute(llvm::Attribute::NoUnwind);
	+  ThunkAttrs.addAttribute(llvm::Attribute::Naked);
	+  Thunk->addAttributes(llvm::AttributeList::FunctionIndex, ThunkAttrs);
	+
	+  // Give the function a minimal body so that it verifies.
	+  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Thunk);
	+  IRBuilder<> IRB(EntryBB);
	+  IRB.CreateRetVoid();
	+}
	+
	+/// Append `mov Reg, (stack pointer)` to \p MBB, overwriting the word at the
	+/// top of the stack with \p Reg. The 64-bit or 32-bit opcode and stack
	+/// pointer register are chosen from Is64Bit.
	+void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
	+                                                    unsigned Reg) {
	+  unsigned StoreOpc = X86::MOV32mr;
	+  unsigned StackPtr = X86::ESP;
	+  if (Is64Bit) {
	+    StoreOpc = X86::MOV64mr;
	+    StackPtr = X86::RSP;
	+  }
	+
	+  MachineInstrBuilder Store = BuildMI(&MBB, DebugLoc(), TII->get(StoreOpc));
	+  addRegOffset(Store, StackPtr, false, 0).addReg(Reg);
	+}
	+
	+/// Append to \p MBB the 32-bit sequence that replaces the return address
	+/// without using a scratch register.
	+void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
	+    MachineBasicBlock &MBB) {
	+  // The sequence emitted below is:
	+  //         # Clear capture_spec from return address.
	+  //         addl $4, %esp
	+  //         # Top of stack words are: Callee, RA. Exchange Callee and RA.
	+  //         pushl 4(%esp)  # Push callee
	+  //         pushl 4(%esp)  # Push RA
	+  //         popl 8(%esp)   # Pop RA to final RA
	+  //         popl (%esp)    # Pop callee to next top of stack
	+  // The trailing retl is not emitted here.
	+
	+  // Appends "<Opc> Disp(%esp)" to MBB.
	+  auto EmitESPMemOp = [&](unsigned Opc, int Disp) {
	+    addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(Opc)), X86::ESP, false,
	+                 Disp);
	+  };
	+
	+  BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
	+      .addReg(X86::ESP)
	+      .addImm(4);
	+  EmitESPMemOp(X86::PUSH32rmm, 4);
	+  EmitESPMemOp(X86::PUSH32rmm, 4);
	+  EmitESPMemOp(X86::POP32rmm, 8);
	+  EmitESPMemOp(X86::POP32rmm, 0);
	+}
	+
	+/// Replace the body of the thunk \p MF with the retpoline sequence.
	+///
	+/// Three blocks are produced: the entry block calls CallTarget; CaptureSpec
	+/// is an infinite pause/lfence loop; CallTarget overwrites the return
	+/// address pushed by that call and returns.
	+///
	+/// \param Reg register holding the real call target. When it is None the
	+///            32-bit push-based sequence is used instead.
	+void X86RetpolineThunks::populateThunk(MachineFunction &MF,
	+                                       Optional<unsigned> Reg) {
	+  // Set MF properties. We never use vregs...
	+  MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
	+
	+  MachineBasicBlock *Entry = &MF.front();
	+  Entry->clear();
	+
	+  MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
	+  MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
	+  MF.push_back(CaptureSpec);
	+  MF.push_back(CallTarget);
	+
	+  const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
	+  const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
	+
	+  BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget);
	+  Entry->addSuccessor(CallTarget);
	+  Entry->addSuccessor(CaptureSpec);
	+  CallTarget->setHasAddressTaken();
	+
	+  // In the capture loop for speculation, we want to stop the processor from
	+  // speculating as fast as possible. On Intel processors, the PAUSE instruction
	+  // will block speculation without consuming any execution resources. On AMD
	+  // processors, the PAUSE instruction is (essentially) a nop, so we also use an
	+  // LFENCE instruction which they have advised will stop speculation as well
	+  // with minimal resource utilization. We still end the capture with a jump to
	+  // form an infinite loop to fully guarantee that no matter what implementation
	+  // of the x86 ISA, speculating this code path never escapes.
	+  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
	+  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
	+  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
	+  CaptureSpec->setHasAddressTaken();
	+  CaptureSpec->addSuccessor(CaptureSpec);
	+
	+  CallTarget->setAlignment(4);
	+  if (Reg) {
	+    // The helper takes the block by reference and a plain register number,
	+    // so dereference both the block pointer and the Optional.
	+    insertRegReturnAddrClobber(*CallTarget, *Reg);
	+  } else {
	+    assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
	+    insert32BitPushReturnAddrClobber(*CallTarget);
	+  }
	+  BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
	+}

	Property changes on: head/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: head/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86Subtarget.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86Subtarget.cpp (revision 328817)
	@@ -1,408 +1,410 @@
	//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the X86 specific subclass of TargetSubtargetInfo.
	//
	//===----------------------------------------------------------------------===//

	#include "X86.h"

	#include "X86CallLowering.h"
	#include "X86LegalizerInfo.h"
	#include "X86RegisterBankInfo.h"
	#include "X86Subtarget.h"
	#include "MCTargetDesc/X86BaseInfo.h"
	#include "X86TargetMachine.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/CodeGen/GlobalISel/CallLowering.h"
	#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/ConstantRange.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"

	#if defined(_MSC_VER)
	#include <intrin.h>
	#endif

	using namespace llvm;

	#define DEBUG_TYPE "subtarget"

	#define GET_SUBTARGETINFO_TARGET_DESC
	#define GET_SUBTARGETINFO_CTOR
	#include "X86GenSubtargetInfo.inc"

	// Temporary option to control early if-conversion for x86 while adding machine
	// models.
	// Registered as the hidden command-line option "x86-early-ifcvt"; its value is
	// read by X86Subtarget::enableEarlyIfConversion() later in this file.
	static cl::opt<bool>
	X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
	cl::desc("Enable early if-conversion on X86"));


	/// Classify a blockaddress reference for the current subtarget according to how
	/// we should reference it in a non-pcrel context.
	unsigned char X86Subtarget::classifyBlockAddressReference() const {
	return classifyLocalReference(nullptr);
	}

	/// Classify a global variable reference for the current subtarget according to
	/// how we should reference it in a non-pcrel context.
	unsigned char
	X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
	return classifyGlobalReference(GV, *GV->getParent());
	}

	/// Classify a reference to something known to be local, returning the X86II
	/// operand flag to use in a non-pcrel context.
	///
	/// \p GV may be null (classifyBlockAddressReference() passes nullptr); it is
	/// only inspected on the Darwin path below.
	unsigned char
	X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
	// 64 bits can use %rip addressing for anything local.
	if (is64Bit())
	return X86II::MO_NO_FLAG;

	// If this is for a position dependent executable, the static linker can
	// figure it out.
	if (!isPositionIndependent())
	return X86II::MO_NO_FLAG;

	// The COFF dynamic linker just patches the executable sections.
	if (isTargetCOFF())
	return X86II::MO_NO_FLAG;

	if (isTargetDarwin()) {
	// 32 bit macho has no relocation for a-b if a is undefined, even if
	// b is in the section that is being relocated.
	// This means we have to use a load even for GVs that are known to be
	// local to the dso.
	if (GV && (GV->isDeclarationForLinker() \|\| GV->hasCommonLinkage()))
	return X86II::MO_DARWIN_NONLAZY_PIC_BASE;

	return X86II::MO_PIC_BASE_OFFSET;
	}

	// Remaining case: 32-bit position-independent code that is neither COFF nor
	// Darwin.
	return X86II::MO_GOTOFF;
	}

	unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
	const Module &M) const {
	// Large model never uses stubs.
	if (TM.getCodeModel() == CodeModel::Large)
	return X86II::MO_NO_FLAG;

	// Absolute symbols can be referenced directly.
	if (GV) {
	if (Optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) {
	// See if we can use the 8-bit immediate form. Note that some instructions
	// will sign extend the immediate operand, so to be conservative we only
	// accept the range [0,128).
	if (CR->getUnsignedMax().ult(128))
	return X86II::MO_ABS8;
	else
	return X86II::MO_NO_FLAG;
	}
	}

	if (TM.shouldAssumeDSOLocal(M, GV))
	return classifyLocalReference(GV);

	if (isTargetCOFF())
	return X86II::MO_DLLIMPORT;

	if (is64Bit())
	return X86II::MO_GOTPCREL;

	if (isTargetDarwin()) {
	if (!isPositionIndependent())
	return X86II::MO_DARWIN_NONLAZY;
	return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
	}

	return X86II::MO_GOT;
	}

	unsigned char
	X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
	return classifyGlobalFunctionReference(GV, *GV->getParent());
	}

	unsigned char
	X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
	const Module &M) const {
	if (TM.shouldAssumeDSOLocal(M, GV))
	return X86II::MO_NO_FLAG;

	if (isTargetCOFF()) {
	assert(GV->hasDLLImportStorageClass() &&
	"shouldAssumeDSOLocal gave inconsistent answer");
	return X86II::MO_DLLIMPORT;
	}

	const Function *F = dyn_cast_or_null<Function>(GV);

	if (isTargetELF()) {
	if (is64Bit() && F && (CallingConv::X86_RegCall == F->getCallingConv()))
	// According to psABI, PLT stub clobbers XMM8-XMM15.
	// In Regcall calling convention those registers are used for passing
	// parameters. Thus we need to prevent lazy binding in Regcall.
	return X86II::MO_GOTPCREL;
	if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit())
	return X86II::MO_GOTPCREL;
	return X86II::MO_PLT;
	}

	if (is64Bit()) {
	if (F && F->hasFnAttribute(Attribute::NonLazyBind))
	// If the function is marked as non-lazy, generate an indirect call
	// which loads from the GOT directly. This avoids runtime overhead
	// at the cost of eager binding (and one extra byte of encoding).
	return X86II::MO_GOTPCREL;
	return X86II::MO_NO_FLAG;
	}

	return X86II::MO_NO_FLAG;
	}

	/// Report whether this subtarget permits a call to an immediate address.
	bool X86Subtarget::isLegalToCallImmediateAddr() const {
	  // Never legal in 64-bit mode.
	  if (In64BitMode)
	    return false;

	  // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
	  // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
	  // this check for Win32 should be removed.
	  if (isTargetWin32())
	    return false;

	  // ELF always qualifies; other formats only under the static relocation
	  // model.
	  if (isTargetELF())
	    return true;
	  return TM.getRelocationModel() == Reloc::Static;
	}

	/// Build the effective CPU name and feature string from \p CPU and \p FS,
	/// parse them, and then derive the settings that depend on the parsed
	/// features: unaligned 16-byte access speed, instruction itineraries, the
	/// mode feature bit, stack alignment and gather/scatter overhead.
	///
	/// initializeSubtargetDependencies() calls this right after
	/// initializeEnvironment(), so fields not touched here keep those defaults.
	void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
	std::string CPUName = CPU;
	// An empty CPU name falls back to "generic".
	if (CPUName.empty())
	CPUName = "generic";

	// Make sure 64-bit features are available in 64-bit mode. (But make sure
	// SSE2 can be turned off explicitly.)
	// The implied features are prepended, so the caller's FS follows them in the
	// final string.
	std::string FullFS = FS;
	if (In64BitMode) {
	if (!FullFS.empty())
	FullFS = "+64bit,+sse2," + FullFS;
	else
	FullFS = "+64bit,+sse2";
	}

	// LAHF/SAHF are always supported in non-64-bit mode.
	if (!In64BitMode) {
	if (!FullFS.empty())
	FullFS = "+sahf," + FullFS;
	else
	FullFS = "+sahf";
	}

	// Parse features string and set the CPU.
	ParseSubtargetFeatures(CPUName, FullFS);

	// All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of
	// 16-bytes and under that are reasonably fast. These features were
	// introduced with Intel's Nehalem/Silvermont and AMD's Family10h
	// micro-architectures respectively.
	if (hasSSE42() \|\| hasSSE4A())
	IsUAMem16Slow = false;

	InstrItins = getInstrItineraryForCPU(CPUName);

	// It's important to keep the MCSubtargetInfo feature bits in sync with
	// target data structure which is shared with MC code emitter, etc.
	if (In64BitMode)
	ToggleFeature(X86::Mode64Bit);
	else if (In32BitMode)
	ToggleFeature(X86::Mode32Bit);
	else if (In16BitMode)
	ToggleFeature(X86::Mode16Bit);
	else
	llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");

	DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
	<< ", 3DNowLevel " << X863DNowLevel
	<< ", 64bit " << HasX86_64 << "\n");
	assert((!In64BitMode \|\| HasX86_64) &&
	"64-bit code requested on a subtarget that doesn't support it!");

	// Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both
	// 32 and 64 bit) and for all 64-bit targets.
	// A non-zero StackAlignOverride takes precedence; if neither branch applies,
	// stackAlignment keeps the value it had on entry.
	if (StackAlignOverride)
	stackAlignment = StackAlignOverride;
	else if (isTargetDarwin() \|\| isTargetLinux() \|\| isTargetSolaris() \|\|
	isTargetKFreeBSD() \|\| In64BitMode)
	stackAlignment = 16;

	// Some CPUs have more overhead for gather. The specified overhead is relative
	// to the Load operation. "2" is the number provided by Intel architects. This
	// parameter is used for cost estimation of Gather Op and comparison with
	// other alternatives.
	// TODO: Remove the explicit hasAVX512()?, That would mean we would only
	// enable gather with a -march.
	if (hasAVX512() \|\| (hasAVX2() && hasFastGather()))
	GatherOverhead = 2;
	if (hasAVX512())
	ScatterOverhead = 2;
	}

	/// Reset the subtarget's feature flags and tuning parameters to their
	/// defaults: every boolean flag is cleared, the SSE/3DNow levels are set to
	/// "none", and the numeric parameters get the baseline values assigned below.
	///
	/// initializeSubtargetDependencies() calls this immediately before
	/// initSubtargetFeatures(), which then overrides some of these values.
	void X86Subtarget::initializeEnvironment() {
	X86SSELevel = NoSSE;
	X863DNowLevel = NoThreeDNow;
	HasX87 = false;
	HasCMov = false;
	HasX86_64 = false;
	HasPOPCNT = false;
	HasSSE4A = false;
	HasAES = false;
	HasVAES = false;
	HasFXSR = false;
	HasXSAVE = false;
	HasXSAVEOPT = false;
	HasXSAVEC = false;
	HasXSAVES = false;
	HasPCLMUL = false;
	HasVPCLMULQDQ = false;
	HasGFNI = false;
	HasFMA = false;
	HasFMA4 = false;
	HasXOP = false;
	HasTBM = false;
	HasLWP = false;
	HasMOVBE = false;
	HasRDRAND = false;
	HasF16C = false;
	HasFSGSBase = false;
	HasLZCNT = false;
	HasBMI = false;
	HasBMI2 = false;
	HasVBMI = false;
	HasVBMI2 = false;
	HasIFMA = false;
	HasRTM = false;
	HasERI = false;
	HasCDI = false;
	HasPFI = false;
	HasDQI = false;
	HasVPOPCNTDQ = false;
	HasBWI = false;
	HasVLX = false;
	HasADX = false;
	HasPKU = false;
	HasVNNI = false;
	HasBITALG = false;
	HasSHA = false;
	HasPREFETCHWT1 = false;
	HasPRFCHW = false;
	HasRDSEED = false;
	HasLAHFSAHF = false;
	HasMWAITX = false;
	HasCLZERO = false;
	HasMPX = false;
	HasSHSTK = false;
	HasIBT = false;
	HasSGX = false;
	HasCLFLUSHOPT = false;
	HasCLWB = false;
	+ UseRetpoline = false;
	+ UseRetpolineExternalThunk = false;
	IsPMULLDSlow = false;
	IsSHLDSlow = false;
	IsUAMem16Slow = false;
	IsUAMem32Slow = false;
	HasSSEUnalignedMem = false;
	HasCmpxchg16b = false;
	UseLeaForSP = false;
	HasFastVariableShuffle = false;
	HasFastPartialYMMorZMMWrite = false;
	HasFastGather = false;
	HasFastScalarFSQRT = false;
	HasFastVectorFSQRT = false;
	HasFastLZCNT = false;
	HasFastSHLDRotate = false;
	HasMacroFusion = false;
	HasERMSB = false;
	HasSlowDivide32 = false;
	HasSlowDivide64 = false;
	PadShortFunctions = false;
	SlowTwoMemOps = false;
	LEAUsesAG = false;
	SlowLEA = false;
	Slow3OpsLEA = false;
	SlowIncDec = false;
	// Baseline stack alignment; initSubtargetFeatures() may raise it to 16 or
	// replace it with StackAlignOverride.
	stackAlignment = 4;
	// FIXME: this is a known good value for Yonah. How about others?
	MaxInlineSizeThreshold = 128;
	UseSoftFloat = false;
	X86ProcFamily = Others;
	// Large default overheads; initSubtargetFeatures() lowers them to 2 when the
	// AVX-512 / AVX2-with-fast-gather checks there succeed.
	GatherOverhead = 1024;
	ScatterOverhead = 1024;
	}

	/// Reset all fields to their defaults, apply the features selected by \p CPU
	/// and \p FS on top, and hand back this subtarget so the call can be used
	/// inside the constructor's member-initializer list.
	X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
	                                                            StringRef FS) {
	  // Defaults first, then the CPU / feature-string overrides.
	  initializeEnvironment();
	  initSubtargetFeatures(CPU, FS);

	  X86Subtarget &Self = *this;
	  return Self;
	}

	/// Construct the subtarget for triple \p TT, CPU \p CPU and feature string
	/// \p FS.
	///
	/// The 16/32/64-bit mode flags are derived from the triple's architecture and
	/// environment. InstrInfo is initialized from the result of
	/// initializeSubtargetDependencies(), so the defaults are set and the features
	/// parsed as part of the member-initializer list; FrameLowering is then
	/// constructed from getStackAlignment().
	X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
	const X86TargetMachine &TM,
	unsigned StackAlignOverride)
	: X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
	PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
	StackAlignOverride(StackAlignOverride),
	In64BitMode(TargetTriple.getArch() == Triple::x86_64),
	In32BitMode(TargetTriple.getArch() == Triple::x86 &&
	TargetTriple.getEnvironment() != Triple::CODE16),
	In16BitMode(TargetTriple.getArch() == Triple::x86 &&
	TargetTriple.getEnvironment() == Triple::CODE16),
	InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
	FrameLowering(*this, getStackAlignment()) {
	// Determine the PICStyle based on the target selected.
	// If no branch below matches, PICStyle keeps the PICStyles::None it was
	// given in the initializer list.
	if (!isPositionIndependent())
	setPICStyle(PICStyles::None);
	else if (is64Bit())
	setPICStyle(PICStyles::RIPRel);
	else if (isTargetCOFF())
	setPICStyle(PICStyles::None);
	else if (isTargetDarwin())
	setPICStyle(PICStyles::StubPIC);
	else if (isTargetELF())
	setPICStyle(PICStyles::GOT);

	// Create the GlobalISel helper objects owned by this subtarget.
	CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering()));
	Legalizer.reset(new X86LegalizerInfo(*this, TM));

	// RBI is handed to RegBankInfo, which owns it; the raw pointer is kept only
	// so it can also be passed to the instruction selector.
	auto RBI = new X86RegisterBankInfo(getRegisterInfo());
	RegBankInfo.reset(RBI);
	InstSelector.reset(createX86InstructionSelector(TM, this, RBI));
	}

	/// GlobalISel: non-owning pointer to the call-lowering object created in the
	/// constructor.
	const CallLowering *X86Subtarget::getCallLowering() const {
	  const CallLowering *Lowering = CallLoweringInfo.get();
	  return Lowering;
	}

	/// GlobalISel: non-owning pointer to the instruction selector created in the
	/// constructor.
	const InstructionSelector *X86Subtarget::getInstructionSelector() const {
	  const InstructionSelector *Selector = InstSelector.get();
	  return Selector;
	}

	/// GlobalISel: non-owning pointer to the legalizer info created in the
	/// constructor.
	const LegalizerInfo *X86Subtarget::getLegalizerInfo() const {
	  const LegalizerInfo *Info = Legalizer.get();
	  return Info;
	}

	/// GlobalISel: non-owning pointer to the register bank info created in the
	/// constructor.
	const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
	  const RegisterBankInfo *Banks = RegBankInfo.get();
	  return Banks;
	}

	/// Early if-conversion is enabled only when the subtarget has CMOV and the
	/// "x86-early-ifcvt" option is set.
	bool X86Subtarget::enableEarlyIfConversion() const {
	  if (!hasCMov())
	    return false;
	  return X86EarlyIfConv;
	}
	Index: head/contrib/llvm/lib/Target/X86/X86Subtarget.h
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86Subtarget.h (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86Subtarget.h (revision 328817)
	@@ -1,721 +1,735 @@
	//===-- X86Subtarget.h - Define Subtarget for the X86 ----------- C++ ---===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file declares the X86 specific subclass of TargetSubtargetInfo.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
	#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H

	#include "X86FrameLowering.h"
	#include "X86ISelLowering.h"
	#include "X86InstrInfo.h"
	#include "X86SelectionDAGInfo.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/CodeGen/GlobalISel/CallLowering.h"
	#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
	#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
	#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/MC/MCInstrItineraries.h"
	#include "llvm/Target/TargetMachine.h"
	#include <memory>

	#define GET_SUBTARGETINFO_HEADER
	#include "X86GenSubtargetInfo.inc"

	namespace llvm {

	class GlobalValue;

	/// The X86 backend supports a number of different styles of PIC.
	///
	namespace PICStyles {

	enum Style {
	StubPIC, // Used on i386-darwin in PIC mode.
	GOT, // Used on 32-bit ELF when in PIC mode.
	RIPRel, // Used on X86-64 when in PIC mode.
	None // Set when not in PIC mode.
	};

	} // end namespace PICStyles

	class X86Subtarget final : public X86GenSubtargetInfo {
	public:
	enum X86ProcFamilyEnum {
	Others,
	IntelAtom,
	IntelSLM,
	IntelGLM,
	IntelHaswell,
	IntelBroadwell,
	IntelSkylake,
	IntelKNL,
	IntelSKX,
	IntelCannonlake,
	IntelIcelake,
	};

	protected:
	enum X86SSEEnum {
	NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
	};

	enum X863DNowEnum {
	NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
	};

	/// X86 processor family: Intel Atom, and others
	X86ProcFamilyEnum X86ProcFamily;

	/// Which PIC style to use
	PICStyles::Style PICStyle;

	const TargetMachine &TM;

	/// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
	X86SSEEnum X86SSELevel;

	/// MMX, 3DNow, 3DNow Athlon, or none supported.
	X863DNowEnum X863DNowLevel;

	/// True if the processor supports X87 instructions.
	bool HasX87;

	/// True if this processor has conditional move instructions
	/// (generally pentium pro+).
	bool HasCMov;

	/// True if the processor supports X86-64 instructions.
	bool HasX86_64;

	/// True if the processor supports POPCNT.
	bool HasPOPCNT;

	/// True if the processor supports SSE4A instructions.
	bool HasSSE4A;

	/// Target has AES instructions
	bool HasAES;
	bool HasVAES;

	/// Target has FXSAVE/FXRESTOR instructions
	bool HasFXSR;

	/// Target has XSAVE instructions
	bool HasXSAVE;

	/// Target has XSAVEOPT instructions
	bool HasXSAVEOPT;

	/// Target has XSAVEC instructions
	bool HasXSAVEC;

	/// Target has XSAVES instructions
	bool HasXSAVES;

	/// Target has carry-less multiplication
	bool HasPCLMUL;
	bool HasVPCLMULQDQ;

	/// Target has Galois Field Arithmetic instructions
	bool HasGFNI;

	/// Target has 3-operand fused multiply-add
	bool HasFMA;

	/// Target has 4-operand fused multiply-add
	bool HasFMA4;

	/// Target has XOP instructions
	bool HasXOP;

	/// Target has TBM instructions.
	bool HasTBM;

	/// Target has LWP instructions
	bool HasLWP;

	/// True if the processor has the MOVBE instruction.
	bool HasMOVBE;

	/// True if the processor has the RDRAND instruction.
	bool HasRDRAND;

	/// Processor has 16-bit floating point conversion instructions.
	bool HasF16C;

	/// Processor has FS/GS base instructions.
	bool HasFSGSBase;

	/// Processor has LZCNT instruction.
	bool HasLZCNT;

	/// Processor has BMI1 instructions.
	bool HasBMI;

	/// Processor has BMI2 instructions.
	bool HasBMI2;

	/// Processor has VBMI instructions.
	bool HasVBMI;

	/// Processor has VBMI2 instructions.
	bool HasVBMI2;

	/// Processor has Integer Fused Multiply Add
	bool HasIFMA;

	/// Processor has RTM instructions.
	bool HasRTM;

	/// Processor has ADX instructions.
	bool HasADX;

	/// Processor has SHA instructions.
	bool HasSHA;

	/// Processor has PRFCHW instructions.
	bool HasPRFCHW;

	/// Processor has RDSEED instructions.
	bool HasRDSEED;

	/// Processor has LAHF/SAHF instructions.
	bool HasLAHFSAHF;

	/// Processor has MONITORX/MWAITX instructions.
	bool HasMWAITX;

	/// Processor has Cache Line Zero instruction
	bool HasCLZERO;

	/// Processor has Prefetch with intent to Write instruction
	bool HasPREFETCHWT1;

	/// True if SHLD instructions are slow.
	bool IsSHLDSlow;

	/// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
	/// PMULUDQ.
	bool IsPMULLDSlow;

	/// True if unaligned memory accesses of 16-bytes are slow.
	bool IsUAMem16Slow;

	/// True if unaligned memory accesses of 32-bytes are slow.
	bool IsUAMem32Slow;

	/// True if SSE operations can have unaligned memory operands.
	/// This may require setting a configuration bit in the processor.
	bool HasSSEUnalignedMem;

	/// True if this processor has the CMPXCHG16B instruction;
	/// this is true for most x86-64 chips, but not the first AMD chips.
	bool HasCmpxchg16b;

	/// True if the LEA instruction should be used for adjusting
	/// the stack pointer. This is an optimization for Intel Atom processors.
	bool UseLeaForSP;

	/// True if it is preferable to combine to a single shuffle using a variable
	/// mask over multiple fixed shuffles.
	bool HasFastVariableShuffle;

	/// True if there is no performance penalty to writing only the lower parts
	/// of a YMM or ZMM register without clearing the upper part.
	bool HasFastPartialYMMorZMMWrite;

	/// True if gather is reasonably fast. This is true for Skylake client and
	/// all AVX-512 CPUs.
	bool HasFastGather;

	/// True if hardware SQRTSS instruction is at least as fast (latency) as
	/// RSQRTSS followed by a Newton-Raphson iteration.
	bool HasFastScalarFSQRT;

	/// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
	/// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
	bool HasFastVectorFSQRT;

	/// True if 8-bit divisions are significantly faster than
	/// 32-bit divisions and should be used when possible.
	bool HasSlowDivide32;

	/// True if 32-bit divides are significantly faster than
	/// 64-bit divisions and should be used when possible.
	bool HasSlowDivide64;

	/// True if LZCNT instruction is fast.
	bool HasFastLZCNT;

	/// True if SHLD based rotate is fast.
	bool HasFastSHLDRotate;

	/// True if the processor supports macrofusion.
	bool HasMacroFusion;

	/// True if the processor has enhanced REP MOVSB/STOSB.
	bool HasERMSB;

	/// True if the short functions should be padded to prevent
	/// a stall when returning too early.
	bool PadShortFunctions;

	/// True if two memory operand instructions should use a temporary register
	/// instead.
	bool SlowTwoMemOps;

	/// True if the LEA instruction inputs have to be ready at address generation
	/// (AG) time.
	bool LEAUsesAG;

	/// True if the LEA instruction with certain arguments is slow
	bool SlowLEA;

	/// True if the LEA instruction has all three source operands: base, index,
	/// and offset or if the LEA instruction uses base and index registers where
	/// the base is EBP, RBP, or R13
	bool Slow3OpsLEA;

	/// True if INC and DEC instructions are slow when writing to flags
	bool SlowIncDec;

	/// Processor has AVX-512 PreFetch Instructions
	bool HasPFI;

	/// Processor has AVX-512 Exponential and Reciprocal Instructions
	bool HasERI;

	/// Processor has AVX-512 Conflict Detection Instructions
	bool HasCDI;

	/// Processor has AVX-512 population count Instructions
	bool HasVPOPCNTDQ;

	/// Processor has AVX-512 Doubleword and Quadword instructions
	bool HasDQI;

	/// Processor has AVX-512 Byte and Word instructions
	bool HasBWI;

	/// Processor has AVX-512 Vector Length eXtensions
	bool HasVLX;

	/// Processor has PKU extensions
	bool HasPKU;

	/// Processor has AVX-512 Vector Neural Network Instructions
	bool HasVNNI;

	/// Processor has AVX-512 Bit Algorithms instructions
	bool HasBITALG;

	/// Processor supports MPX - Memory Protection Extensions
	bool HasMPX;

	/// Processor supports CET SHSTK - Control-Flow Enforcement Technology
	/// using Shadow Stack
	bool HasSHSTK;

	/// Processor supports CET IBT - Control-Flow Enforcement Technology
	/// using Indirect Branch Tracking
	bool HasIBT;

	/// Processor has Software Guard Extensions
	bool HasSGX;

	/// Processor supports Flush Cache Line instruction
	bool HasCLFLUSHOPT;

	/// Processor supports Cache Line Write Back instruction
	bool HasCLWB;

	+ /// Use a retpoline thunk rather than indirect calls to block speculative
	+ /// execution.
	+ bool UseRetpoline;
	+
	+ /// When using a retpoline thunk, call an externally provided thunk rather
	+ /// than emitting one inside the compiler.
	+ bool UseRetpolineExternalThunk;
	+
	/// Use software floating point for code generation.
	bool UseSoftFloat;

	/// The minimum alignment known to hold of the stack frame on
	/// entry to the function and which must be maintained by every function.
	unsigned stackAlignment;

	/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
	///
	unsigned MaxInlineSizeThreshold;

	/// What processor and OS we're targeting.
	Triple TargetTriple;

	/// Instruction itineraries for scheduling
	InstrItineraryData InstrItins;

	/// GlobalISel related APIs.
	std::unique_ptr<CallLowering> CallLoweringInfo;
	std::unique_ptr<LegalizerInfo> Legalizer;
	std::unique_ptr<RegisterBankInfo> RegBankInfo;
	std::unique_ptr<InstructionSelector> InstSelector;

	private:
	/// Override the stack alignment.
	unsigned StackAlignOverride;

	/// True if compiling for 64-bit, false for 16-bit or 32-bit.
	bool In64BitMode;

	/// True if compiling for 32-bit, false for 16-bit or 64-bit.
	bool In32BitMode;

	/// True if compiling for 16-bit, false for 32-bit or 64-bit.
	bool In16BitMode;

	/// Contains the overhead of gather/scatter instructions
	int GatherOverhead;
	int ScatterOverhead;

	X86SelectionDAGInfo TSInfo;
	// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
	// X86TargetLowering needs.
	X86InstrInfo InstrInfo;
	X86TargetLowering TLInfo;
	X86FrameLowering FrameLowering;

	public:
	/// This constructor initializes the data members to match that
	/// of the specified triple.
	///
	X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
	const X86TargetMachine &TM, unsigned StackAlignOverride);

	const X86TargetLowering *getTargetLowering() const override {
	return &TLInfo;
	}

	const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }

	const X86FrameLowering *getFrameLowering() const override {
	return &FrameLowering;
	}

	const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
	return &TSInfo;
	}

	const X86RegisterInfo *getRegisterInfo() const override {
	return &getInstrInfo()->getRegisterInfo();
	}

	/// Returns the minimum alignment known to hold of the
	/// stack frame on entry to the function and which must be maintained by every
	/// function for this subtarget.
	unsigned getStackAlignment() const { return stackAlignment; }

	/// Returns the maximum memset / memcpy size
	/// that still makes it profitable to inline the call.
	unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }

	/// ParseSubtargetFeatures - Parses features string setting specified
	/// subtarget options. Definition of function is auto generated by tblgen.
	void ParseSubtargetFeatures(StringRef CPU, StringRef FS);

	/// Methods used by Global ISel
	const CallLowering *getCallLowering() const override;
	const InstructionSelector *getInstructionSelector() const override;
	const LegalizerInfo *getLegalizerInfo() const override;
	const RegisterBankInfo *getRegBankInfo() const override;

	private:
	/// Initialize the full set of dependencies so we can use an initializer
	/// list for X86Subtarget.
	X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
	void initializeEnvironment();
	void initSubtargetFeatures(StringRef CPU, StringRef FS);

	public:
	/// Is this x86_64? (disregarding specific ABI / programming model)
	bool is64Bit() const {
	return In64BitMode;
	}

	bool is32Bit() const {
	return In32BitMode;
	}

	bool is16Bit() const {
	return In16BitMode;
	}

	/// Is this x86_64 with the ILP32 programming model (x32 ABI)?
	bool isTarget64BitILP32() const {
	return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 \|\|
	TargetTriple.isOSNaCl());
	}

	/// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
	bool isTarget64BitLP64() const {
	return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
	!TargetTriple.isOSNaCl());
	}

	PICStyles::Style getPICStyle() const { return PICStyle; }
	void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }

	bool hasX87() const { return HasX87; }
	bool hasCMov() const { return HasCMov; }
	bool hasSSE1() const { return X86SSELevel >= SSE1; }
	bool hasSSE2() const { return X86SSELevel >= SSE2; }
	bool hasSSE3() const { return X86SSELevel >= SSE3; }
	bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
	bool hasSSE41() const { return X86SSELevel >= SSE41; }
	bool hasSSE42() const { return X86SSELevel >= SSE42; }
	bool hasAVX() const { return X86SSELevel >= AVX; }
	bool hasAVX2() const { return X86SSELevel >= AVX2; }
	bool hasAVX512() const { return X86SSELevel >= AVX512F; }
	bool hasFp256() const { return hasAVX(); }
	bool hasInt256() const { return hasAVX2(); }
	bool hasSSE4A() const { return HasSSE4A; }
	bool hasMMX() const { return X863DNowLevel >= MMX; }
	bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
	bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
	bool hasPOPCNT() const { return HasPOPCNT; }
	bool hasAES() const { return HasAES; }
	bool hasVAES() const { return HasVAES; }
	bool hasFXSR() const { return HasFXSR; }
	bool hasXSAVE() const { return HasXSAVE; }
	bool hasXSAVEOPT() const { return HasXSAVEOPT; }
	bool hasXSAVEC() const { return HasXSAVEC; }
	bool hasXSAVES() const { return HasXSAVES; }
	bool hasPCLMUL() const { return HasPCLMUL; }
	bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
	bool hasGFNI() const { return HasGFNI; }
	// Prefer FMA4 to FMA - it's better for commutation/memory folding and
	// has equal or better performance on all supported targets.
	bool hasFMA() const { return HasFMA; }
	bool hasFMA4() const { return HasFMA4; }
	bool hasAnyFMA() const { return hasFMA() \|\| hasFMA4(); }
	bool hasXOP() const { return HasXOP; }
	bool hasTBM() const { return HasTBM; }
	bool hasLWP() const { return HasLWP; }
	bool hasMOVBE() const { return HasMOVBE; }
	bool hasRDRAND() const { return HasRDRAND; }
	bool hasF16C() const { return HasF16C; }
	bool hasFSGSBase() const { return HasFSGSBase; }
	bool hasLZCNT() const { return HasLZCNT; }
	bool hasBMI() const { return HasBMI; }
	bool hasBMI2() const { return HasBMI2; }
	bool hasVBMI() const { return HasVBMI; }
	bool hasVBMI2() const { return HasVBMI2; }
	bool hasIFMA() const { return HasIFMA; }
	bool hasRTM() const { return HasRTM; }
	bool hasADX() const { return HasADX; }
	bool hasSHA() const { return HasSHA; }
	bool hasPRFCHW() const { return HasPRFCHW \|\| HasPREFETCHWT1; }
	bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
	bool hasSSEPrefetch() const {
	// We implicitly enable these when we have a write prefix supporting cache
	// level OR if we have prfchw, but don't already have a read prefetch from
	// 3dnow.
	return hasSSE1() \|\| (hasPRFCHW() && !has3DNow()) \|\| hasPREFETCHWT1();
	}
	bool hasRDSEED() const { return HasRDSEED; }
	bool hasLAHFSAHF() const { return HasLAHFSAHF; }
	bool hasMWAITX() const { return HasMWAITX; }
	bool hasCLZERO() const { return HasCLZERO; }
	bool isSHLDSlow() const { return IsSHLDSlow; }
	bool isPMULLDSlow() const { return IsPMULLDSlow; }
	bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
	bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
	int getGatherOverhead() const { return GatherOverhead; }
	int getScatterOverhead() const { return ScatterOverhead; }
	bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
	bool hasCmpxchg16b() const { return HasCmpxchg16b; }
	bool useLeaForSP() const { return UseLeaForSP; }
	bool hasFastVariableShuffle() const {
	return HasFastVariableShuffle;
	}
	bool hasFastPartialYMMorZMMWrite() const {
	return HasFastPartialYMMorZMMWrite;
	}
	bool hasFastGather() const { return HasFastGather; }
	bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
	bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
	bool hasFastLZCNT() const { return HasFastLZCNT; }
	bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
	bool hasMacroFusion() const { return HasMacroFusion; }
	bool hasERMSB() const { return HasERMSB; }
	bool hasSlowDivide32() const { return HasSlowDivide32; }
	bool hasSlowDivide64() const { return HasSlowDivide64; }
	bool padShortFunctions() const { return PadShortFunctions; }
	bool slowTwoMemOps() const { return SlowTwoMemOps; }
	bool LEAusesAG() const { return LEAUsesAG; }
	bool slowLEA() const { return SlowLEA; }
	bool slow3OpsLEA() const { return Slow3OpsLEA; }
	bool slowIncDec() const { return SlowIncDec; }
	bool hasCDI() const { return HasCDI; }
	bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
	bool hasPFI() const { return HasPFI; }
	bool hasERI() const { return HasERI; }
	bool hasDQI() const { return HasDQI; }
	bool hasBWI() const { return HasBWI; }
	bool hasVLX() const { return HasVLX; }
	bool hasPKU() const { return HasPKU; }
	bool hasVNNI() const { return HasVNNI; }
	bool hasBITALG() const { return HasBITALG; }
	bool hasMPX() const { return HasMPX; }
	bool hasSHSTK() const { return HasSHSTK; }
	bool hasIBT() const { return HasIBT; }
	bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
	bool hasCLWB() const { return HasCLWB; }
	+ bool useRetpoline() const { return UseRetpoline; }
	+ bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }

	bool isXRaySupported() const override { return is64Bit(); }

	X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }

	/// TODO: to be removed later and replaced with suitable properties
	bool isAtom() const { return X86ProcFamily == IntelAtom; }
	bool isSLM() const { return X86ProcFamily == IntelSLM; }
	bool useSoftFloat() const { return UseSoftFloat; }

	/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
	/// no-sse2). There isn't any reason to disable it if the target processor
	/// supports it.
	bool hasMFence() const { return hasSSE2() \|\| is64Bit(); }

	const Triple &getTargetTriple() const { return TargetTriple; }

	bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
	bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
	bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
	bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
	bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }

	bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
	bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
	bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }

	bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
	bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
	bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
	bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
	bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
	bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
	bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
	bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
	bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }

	bool isTargetWindowsMSVC() const {
	return TargetTriple.isWindowsMSVCEnvironment();
	}

	bool isTargetKnownWindowsMSVC() const {
	return TargetTriple.isKnownWindowsMSVCEnvironment();
	}

	bool isTargetWindowsCoreCLR() const {
	return TargetTriple.isWindowsCoreCLREnvironment();
	}

	bool isTargetWindowsCygwin() const {
	return TargetTriple.isWindowsCygwinEnvironment();
	}

	bool isTargetWindowsGNU() const {
	return TargetTriple.isWindowsGNUEnvironment();
	}

	bool isTargetWindowsItanium() const {
	return TargetTriple.isWindowsItaniumEnvironment();
	}

	bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }

	bool isOSWindows() const { return TargetTriple.isOSWindows(); }

	bool isTargetWin64() const { return In64BitMode && isOSWindows(); }

	bool isTargetWin32() const { return !In64BitMode && isOSWindows(); }

	// PIC-style queries: compare the stored PICStyle against one enumerator.
	bool isPICStyleGOT() const {
	  return PICStyle == PICStyles::GOT;
	}
	bool isPICStyleRIPRel() const {
	  return PICStyle == PICStyles::RIPRel;
	}

	bool isPICStyleStubPIC() const {
	  return PICStyle == PICStyles::StubPIC;
	}

	/// Asks the target machine whether position-independent code is in effect.
	bool isPositionIndependent() const {
	  return TM.isPositionIndependent();
	}

	/// Report whether calling convention \p CC is lowered using the Win64
	/// convention on this subtarget.
	bool isCallingConvWin64(CallingConv::ID CC) const {
	  // This convention allows using the Win64 convention on other targets, so
	  // it is Win64 no matter what we are compiling for.
	  if (CC == CallingConv::Win64)
	    return true;

	  switch (CC) {
	  // On Win64, all these conventions just use the default convention; on any
	  // other target they are not Win64.
	  case CallingConv::C:
	  case CallingConv::Fast:
	  case CallingConv::Swift:
	  case CallingConv::X86_FastCall:
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_VectorCall:
	  case CallingConv::Intel_OCL_BI:
	    return isTargetWin64();
	  // X86_64_SysV explicitly requests the SysV convention (even on Windows
	  // targets), and anything else is unknown: neither is Win64.
	  default:
	    return false;
	  }
	}

	/// Classify a global variable reference for the current subtarget according
	/// to how we should reference it in a non-pcrel context.
	unsigned char classifyLocalReference(const GlobalValue *GV) const;

	unsigned char classifyGlobalReference(const GlobalValue *GV,
	const Module &M) const;
	unsigned char classifyGlobalReference(const GlobalValue *GV) const;

	/// Classify a global function reference for the current subtarget.
	unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
	const Module &M) const;
	unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;

	/// Classify a blockaddress reference for the current subtarget according to
	/// how we should reference it in a non-pcrel context.
	unsigned char classifyBlockAddressReference() const;

	/// Return true if the subtarget allows calls to immediate address.
	bool isLegalToCallImmediateAddr() const;
	+
	+ /// When retpolines are in use, indirectbr must be expanded so that it is
	+ /// never lowered to an actual indirect jump; the answer is useRetpoline().
	+ bool enableIndirectBrExpand() const override { return useRetpoline(); }

	/// The MachineScheduler pass is enabled for every X86 subtarget.
	bool enableMachineScheduler() const override {
	  return true;
	}

	// TODO: Update the regression tests and return true.
	bool supportPrintSchedInfo() const override {
	  return false;
	}

	/// Declared here only; the definition is not part of this header.
	bool enableEarlyIfConversion() const override;

	/// Hands out the instruction itineraries held by this subtarget.
	const InstrItineraryData *getInstrItineraryData() const override {
	  return &InstrItins;
	}

	/// Anti-dependency breaking is always ANTIDEP_CRITICAL here.
	AntiDepBreakMode getAntiDepBreakMode() const override {
	  return TargetSubtargetInfo::ANTIDEP_CRITICAL;
	}

	/// Always opts in to the advanced register-allocator split cost.
	bool enableAdvancedRASplitCost() const override {
	  return true;
	}
	};

	} // end namespace llvm

	#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H
	Index: head/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp (revision 328817)
	@@ -1,438 +1,448 @@
	//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the X86 specific subclass of TargetMachine.
	//
	//===----------------------------------------------------------------------===//

	#include "X86TargetMachine.h"
	#include "MCTargetDesc/X86MCTargetDesc.h"
	#include "X86.h"
	#include "X86CallLowering.h"
	#include "X86LegalizerInfo.h"
	#include "X86MacroFusion.h"
	#include "X86Subtarget.h"
	#include "X86TargetObjectFile.h"
	#include "X86TargetTransformInfo.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/CodeGen/ExecutionDepsFix.h"
	#include "llvm/CodeGen/GlobalISel/CallLowering.h"
	#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
	#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
	#include "llvm/CodeGen/GlobalISel/Legalizer.h"
	#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
	#include "llvm/CodeGen/MachineScheduler.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/TargetLoweringObjectFile.h"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/Function.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Target/TargetOptions.h"
	#include <memory>
	#include <string>

	using namespace llvm;

	// Hidden command-line flag "x86-machine-combiner" (defaults to true). It is
	// consulted in X86PassConfig::addILPOpts() to decide whether the machine
	// combiner pass is added.
	static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
	cl::desc("Enable the machine combiner pass"),
	cl::init(true), cl::Hidden);

	// Forward declarations of pass-initialization functions that
	// LLVMInitializeX86Target() calls with the global PassRegistry.
	namespace llvm {

	void initializeWinEHStatePassPass(PassRegistry &);
	void initializeFixupLEAPassPass(PassRegistry &);
	void initializeX86CallFrameOptimizationPass(PassRegistry &);
	void initializeX86CmovConverterPassPass(PassRegistry &);
	void initializeX86ExecutionDepsFixPass(PassRegistry &);
	void initializeX86DomainReassignmentPass(PassRegistry &);

	} // end namespace llvm

	extern "C" void LLVMInitializeX86Target() {
	// Register the target.
	RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
	RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());

	PassRegistry &PR = *PassRegistry::getPassRegistry();
	initializeGlobalISel(PR);
	initializeWinEHStatePassPass(PR);
	initializeFixupBWInstPassPass(PR);
	initializeEvexToVexInstPassPass(PR);
	initializeFixupLEAPassPass(PR);
	initializeX86CallFrameOptimizationPass(PR);
	initializeX86CmovConverterPassPass(PR);
	initializeX86ExecutionDepsFixPass(PR);
	initializeX86DomainReassignmentPass(PR);
	}

	static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
	if (TT.isOSBinFormatMachO()) {
	if (TT.getArch() == Triple::x86_64)
	return llvm::make_unique<X86_64MachoTargetObjectFile>();
	return llvm::make_unique<TargetLoweringObjectFileMachO>();
	}

	if (TT.isOSFreeBSD())
	return llvm::make_unique<X86FreeBSDTargetObjectFile>();
	if (TT.isOSLinux() \|\| TT.isOSNaCl() \|\| TT.isOSIAMCU())
	return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
	if (TT.isOSSolaris())
	return llvm::make_unique<X86SolarisTargetObjectFile>();
	if (TT.isOSFuchsia())
	return llvm::make_unique<X86FuchsiaTargetObjectFile>();
	if (TT.isOSBinFormatELF())
	return llvm::make_unique<X86ELFTargetObjectFile>();
	if (TT.isKnownWindowsMSVCEnvironment() \|\| TT.isWindowsCoreCLREnvironment())
	return llvm::make_unique<X86WindowsTargetObjectFile>();
	if (TT.isOSBinFormatCOFF())
	return llvm::make_unique<TargetLoweringObjectFileCOFF>();
	llvm_unreachable("unknown subtarget type");
	}

	/// Build the data layout string for triple \p TT.
	///
	/// The string is assembled piece by piece: endianness, name mangling,
	/// pointer size, i64/f64 alignment, long double alignment, native integer
	/// widths and finally stack alignment.
	///
	/// \param TT Target triple describing architecture, OS and environment.
	/// \returns The complete data layout specification.
	static std::string computeDataLayout(const Triple &TT) {
	  // X86 is little endian
	  std::string Ret = "e";

	  Ret += DataLayout::getManglingComponent(TT);
	  // X86 and x32 have 32 bit pointers.
	  if ((TT.isArch64Bit() &&
	       (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
	      !TT.isArch64Bit())
	    Ret += "-p:32:32";

	  // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
	  if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
	    Ret += "-i64:64";
	  else if (TT.isOSIAMCU())
	    Ret += "-i64:32-f64:32";
	  else
	    Ret += "-f64:32:64";

	  // Some ABIs align long double to 128 bits, others to 32.
	  if (TT.isOSNaCl() || TT.isOSIAMCU())
	    ; // No f80
	  else if (TT.isArch64Bit() || TT.isOSDarwin())
	    Ret += "-f80:128";
	  else
	    Ret += "-f80:32";

	  if (TT.isOSIAMCU())
	    Ret += "-f128:32";

	  // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
	  if (TT.isArch64Bit())
	    Ret += "-n8:16:32:64";
	  else
	    Ret += "-n8:16:32";

	  // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
	  if ((!TT.isArch64Bit() && TT.isOSWindows()) || TT.isOSIAMCU())
	    Ret += "-a:0:32-S32";
	  else
	    Ret += "-S128";

	  return Ret;
	}

	/// Resolve the relocation model to use for triple \p TT.
	///
	/// \param TT Target triple being compiled for.
	/// \param RM Relocation model requested by the client, if any.
	/// \returns The platform default when \p RM is unset; otherwise \p RM,
	/// adjusted where the target cannot use the request as given.
	static Reloc::Model getEffectiveRelocModel(const Triple &TT,
	                                           Optional<Reloc::Model> RM) {
	  const bool Is64Bit = TT.getArch() == Triple::x86_64;
	  const bool IsDarwin = TT.isOSDarwin();

	  if (!RM.hasValue()) {
	    // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit
	    // mode.
	    if (IsDarwin)
	      return Is64Bit ? Reloc::PIC_ : Reloc::DynamicNoPIC;
	    // Win64 requires rip-rel addressing, thus we force it to PIC. Everything
	    // else uses the static relocation model by default.
	    return (Is64Bit && TT.isOSWindows()) ? Reloc::PIC_ : Reloc::Static;
	  }

	  const Reloc::Model Requested = *RM;

	  // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
	  // is defined as a model for code which may be used in static or dynamic
	  // executables but not necessarily a shared library. On X86-32 we just
	  // compile in -static mode, in x86-64 we use PIC.
	  if (Requested == Reloc::DynamicNoPIC) {
	    if (Is64Bit)
	      return Reloc::PIC_;
	    if (!IsDarwin)
	      return Reloc::Static;
	    // 32-bit Darwin keeps DynamicNoPIC as requested.
	    return Requested;
	  }

	  // Mach-O does not support the static relocation model in X86-64 mode, so
	  // such a request on Darwin is upgraded to PIC.
	  if (Requested == Reloc::Static && IsDarwin && Is64Bit)
	    return Reloc::PIC_;

	  return Requested;
	}

	/// Resolve the code model to use.
	///
	/// \param CM Code model requested by the client, if any; it always wins.
	/// \param JIT Whether code is being generated for a JIT.
	/// \param Is64Bit Whether the target is x86-64.
	/// \returns \p CM when set; otherwise Large for a 64-bit JIT and Small in
	/// every other case.
	static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
	                                              bool JIT, bool Is64Bit) {
	  if (CM.hasValue())
	    return CM.getValue();
	  return (JIT && Is64Bit) ? CodeModel::Large : CodeModel::Small;
	}

	/// Create an X86 target.
	///
	/// The base LLVMTargetMachine is initialized with the data layout computed
	/// from \p TT and with the effective relocation and code models derived from
	/// \p RM, \p CM and \p JIT. The object-file lowering is chosen from the
	/// target triple.
	X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
	                                   StringRef CPU, StringRef FS,
	                                   const TargetOptions &Options,
	                                   Optional<Reloc::Model> RM,
	                                   Optional<CodeModel::Model> CM,
	                                   CodeGenOpt::Level OL, bool JIT)
	    : LLVMTargetMachine(
	          T, computeDataLayout(TT), TT, CPU, FS, Options,
	          getEffectiveRelocModel(TT, RM),
	          getEffectiveCodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL),
	      TLOF(createTLOF(getTargetTriple())) {
	  // Windows stack unwinder gets confused when execution flow "falls through"
	  // after a call to 'noreturn' function.
	  // To prevent that, we emit a trap for 'unreachable' IR instructions.
	  // (which on X86, happens to be the 'ud2' instruction)
	  // On PS4, the "return address" of a 'noreturn' call must still be within
	  // the calling function, and TrapUnreachable is an easy way to get that.
	  // The check here for 64-bit windows is a bit icky, but as we're unlikely
	  // to ever want to mix 32 and 64-bit windows code in a single module
	  // this should be fine.
	  if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4())
	    this->Options.TrapUnreachable = true;

	  initAsmInfo();
	}

	// Destructor is defaulted here, out of line, in the implementation file.
	X86TargetMachine::~X86TargetMachine() = default;

	/// Return the subtarget to use for function \p F, creating and caching it
	/// on first use.
	///
	/// Subtargets are cached in SubtargetMap under a key formed from the CPU
	/// name, the feature string and, when requested, a soft-float suffix. The
	/// "target-cpu" and "target-features" function attributes override the
	/// machine-wide TargetCPU / TargetFS when present.
	const X86Subtarget *
	X86TargetMachine::getSubtargetImpl(const Function &F) const {
	  Attribute CPUAttr = F.getFnAttribute("target-cpu");
	  Attribute FSAttr = F.getFnAttribute("target-features");

	  // Start from the machine-wide defaults and let the attributes override.
	  StringRef CPU = TargetCPU;
	  if (!CPUAttr.hasAttribute(Attribute::None))
	    CPU = CPUAttr.getValueAsString();

	  StringRef FS = TargetFS;
	  if (!FSAttr.hasAttribute(Attribute::None))
	    FS = FSAttr.getValueAsString();

	  SmallString<512> Key;
	  Key.reserve(CPU.size() + FS.size());
	  Key += CPU;
	  Key += FS;

	  // FIXME: This is related to the code below to reset the target options,
	  // we need to know whether or not the soft float flag is set on the
	  // function before we can generate a subtarget. We also need to use
	  // it as a key for the subtarget since that can be the only difference
	  // between two functions.
	  const bool SoftFloat =
	      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
	  // A soft-float function gets the soft-float subtarget feature appended,
	  // comma-separated when other features precede it.
	  if (SoftFloat) {
	    if (FS.empty())
	      Key += "+soft-float";
	    else
	      Key += ",+soft-float";
	  }

	  // Everything in the key after the CPU name is the feature string.
	  FS = Key.substr(CPU.size());

	  auto &Cached = SubtargetMap[Key];
	  if (Cached)
	    return Cached.get();

	  // This needs to be done before we create a new subtarget since any
	  // creation will depend on the TM and the code generation flags on the
	  // function that reside in TargetOptions.
	  resetTargetOptions(F);
	  Cached = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
	                                           Options.StackAlignmentOverride);
	  return Cached.get();
	}

	//===----------------------------------------------------------------------===//
	// Command line options for x86
	//===----------------------------------------------------------------------===//
	// Hidden command-line flag "x86-use-vzeroupper" (defaults to true). It is
	// consulted in X86PassConfig::addPreEmitPass() to decide whether the
	// vzeroupper-issuing pass is added.
	static cl::opt<bool>
	UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
	cl::desc("Minimize AVX to SSE transition penalty"),
	cl::init(true));

	//===----------------------------------------------------------------------===//
	// X86 TTI query.
	//===----------------------------------------------------------------------===//

	/// Build the TargetTransformInfo for \p F, backed by an X86TTIImpl
	/// constructed from this target machine and the function.
	TargetTransformInfo
	X86TargetMachine::getTargetTransformInfo(const Function &F) {
	  X86TTIImpl Impl(this, F);
	  return TargetTransformInfo(Impl);
	}

	//===----------------------------------------------------------------------===//
	// Pass Pipeline Configuration
	//===----------------------------------------------------------------------===//

	namespace {

	/// X86 Code Generator Pass Configuration Options.
	///
	/// Overrides the TargetPassConfig hooks declared below; their definitions
	/// follow the class and add the X86-specific passes for each stage.
	class X86PassConfig : public TargetPassConfig {
	public:
	X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM)
	: TargetPassConfig(TM, PM) {}

	// Returns getTM() typed as X86TargetMachine.
	X86TargetMachine &getX86TargetMachine() const {
	return getTM<X86TargetMachine>();
	}

	// Creates the generic live machine scheduler and attaches the X86
	// macro-fusion DAG mutation to it before returning it.
	ScheduleDAGInstrs *
	createMachineScheduler(MachineSchedContext *C) const override {
	ScheduleDAGMILive *DAG = createGenericSchedLive(C);
	DAG->addMutation(createX86MacroFusionDAGMutation());
	return DAG;
	}

	// Pipeline hooks, defined out of line after the class.
	void addIRPasses() override;
	bool addInstSelector() override;
	bool addIRTranslator() override;
	bool addLegalizeMachineIR() override;
	bool addRegBankSelect() override;
	bool addGlobalInstructionSelect() override;
	bool addILPOpts() override;
	bool addPreISel() override;
	void addMachineSSAOptimization() override;
	void addPreRegAlloc() override;
	void addPostRegAlloc() override;
	void addPreEmitPass() override;
	+ void addPreEmitPass2() override;
	void addPreSched2() override;
	};

	/// ExecutionDepsFix instantiated for X86: the base pass is constructed with
	/// this pass's ID and the X86::VR128XRegClass register class.
	class X86ExecutionDepsFix : public ExecutionDepsFix {
	public:
	  /// Pass identification; only its address matters to the base class.
	  static char ID;

	  X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {}

	  /// Human-readable pass name.
	  StringRef getPassName() const override {
	    return "X86 Execution Dependency Fix";
	  }
	};
	char X86ExecutionDepsFix::ID;

	} // end anonymous namespace

	// Registers X86ExecutionDepsFix under the argument name
	// "x86-execution-deps-fix"; LLVMInitializeX86Target() calls the
	// initializeX86ExecutionDepsFixPass() entry point for it.
	INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix",
	"X86 Execution Dependency Fix", false, false)

	/// Allocate the X86 pass configuration bound to this target machine and
	/// the pass manager \p PM. The result is returned as a raw pointer.
	TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
	  X86PassConfig *Config = new X86PassConfig(*this, PM);
	  return Config;
	}

	// IR-level passes: atomic expansion first, then the generic IR passes, then
	// interleaved-access lowering (only when optimizing), and finally the
	// indirectbr expansion pass.
	void X86PassConfig::addIRPasses() {
	addPass(createAtomicExpandPass());

	TargetPassConfig::addIRPasses();

	if (TM->getOptLevel() != CodeGenOpt::None)
	addPass(createInterleavedAccessPass());
	+
	+ // Add passes that handle indirect branch removal and insertion of a retpoline
	+ // thunk. These will be a no-op unless a function subtarget has the retpoline
	+ // feature enabled.
	+ addPass(createIndirectBrExpandPass());
	}

	bool X86PassConfig::addInstSelector() {
	// Install an instruction selector.
	addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));

	// For ELF, cleanup any local-dynamic TLS accesses.
	if (TM->getTargetTriple().isOSBinFormatELF() &&
	getOptLevel() != CodeGenOpt::None)
	addPass(createCleanupLocalDynamicTLSPass());

	addPass(createX86GlobalBaseRegPass());
	return false;
	}

	bool X86PassConfig::addIRTranslator() {
	addPass(new IRTranslator());
	return false;
	}

	bool X86PassConfig::addLegalizeMachineIR() {
	addPass(new Legalizer());
	return false;
	}

	bool X86PassConfig::addRegBankSelect() {
	addPass(new RegBankSelect());
	return false;
	}

	bool X86PassConfig::addGlobalInstructionSelect() {
	addPass(new InstructionSelect());
	return false;
	}

	/// Add the ILP optimization passes: early if-conversion, the machine
	/// combiner (when enabled by -x86-machine-combiner) and the X86 cmov
	/// converter, in that order. Always returns true.
	bool X86PassConfig::addILPOpts() {
	  addPass(&EarlyIfConverterID);

	  if (EnableMachineCombinerPass) {
	    addPass(&MachineCombinerID);
	  }

	  addPass(createX86CmovConverterPass());
	  return true;
	}

	/// Add passes that run just before instruction selection. Always returns
	/// true.
	bool X86PassConfig::addPreISel() {
	  // The WinEH state pass is only added for 32-bit x86 Windows.
	  const Triple &TT = TM->getTargetTriple();
	  const bool NeedsWinEHState =
	      TT.isOSWindows() && TT.getArch() == Triple::x86;
	  if (NeedsWinEHState)
	    addPass(createX86WinEHStatePass());
	  return true;
	}

	/// Add passes that run before register allocation. The optimization passes
	/// are skipped at CodeGenOpt::None; the WinAlloca expander is always added.
	void X86PassConfig::addPreRegAlloc() {
	  const bool Optimizing = getOptLevel() != CodeGenOpt::None;
	  if (Optimizing) {
	    addPass(&LiveRangeShrinkID);
	    addPass(createX86FixupSetCC());
	    addPass(createX86OptimizeLEAs());
	    addPass(createX86CallFrameOptimization());
	  }

	  addPass(createX86WinAllocaExpander());
	}
	/// Add X86 domain reassignment, followed by the generic machine-SSA
	/// optimization passes of the base class.
	void X86PassConfig::addMachineSSAOptimization() {
	  // Queued first, so it precedes the target-independent passes.
	  addPass(createX86DomainReassignmentPass());

	  TargetPassConfig::addMachineSSAOptimization();
	}

	/// After register allocation, add the X86 floating-point stackifier pass.
	void X86PassConfig::addPostRegAlloc() {
	  addPass(createX86FloatingPointStackifierPass());
	}

	/// Hook for the addPreSched2 stage: add the X86 pseudo-instruction
	/// expansion pass.
	void X86PassConfig::addPreSched2() {
	  addPass(createX86ExpandPseudoPass());
	}

	// Pre-emit X86 passes. The execution-dependency fix and the four fixup
	// passes at the end are added only when optimizing; the vzeroupper pass is
	// controlled solely by the x86-use-vzeroupper flag.
	void X86PassConfig::addPreEmitPass() {
	if (getOptLevel() != CodeGenOpt::None)
	addPass(new X86ExecutionDepsFix());

	if (UseVZeroUpper)
	addPass(createX86IssueVZeroUpperPass());

	if (getOptLevel() != CodeGenOpt::None) {
	addPass(createX86FixupBWInsts());
	addPass(createX86PadShortFunctions());
	addPass(createX86FixupLEAs());
	addPass(createX86EvexToVexInsts());
	}
	+}
	+
	+void X86PassConfig::addPreEmitPass2() {
	+ addPass(createX86RetpolineThunksPass()); // Adds the X86 retpoline thunks pass.
	}
	Index: head/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
	===================================================================
	--- head/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp (revision 328817)
	@@ -1,1359 +1,1360 @@
	//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements a trivial dead store elimination that only considers
	// basic-block local redundant stores.
	//
	// FIXME: This should eventually be extended to be a post-dominator tree
	// traversal. Doing so would be pretty trivial.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/CaptureTracking.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/MemoryBuiltins.h"
	#include "llvm/Analysis/MemoryDependenceAnalysis.h"
	#include "llvm/Analysis/MemoryLocation.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/IR/Argument.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/PassManager.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstddef>
	#include <iterator>
	#include <map>
	#include <utility>

	using namespace llvm;

	#define DEBUG_TYPE "dse"

	// Statistics counters for this pass, declared with LLVM's STATISTIC macro.
	// The string is the description attached to each counter.
	STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
	STATISTIC(NumFastStores, "Number of stores deleted");
	STATISTIC(NumFastOther , "Number of other instrs removed");
	STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
	STATISTIC(NumModifiedStores, "Number of stores modified");

	// Hidden flag (default true). isOverwrite() checks it before recording
	// partial overlaps of an earlier store in the per-instruction interval map.
	static cl::opt<bool>
	EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
	cl::init(true), cl::Hidden,
	cl::desc("Enable partial-overwrite tracking in DSE"));

	// Hidden flag (default true). isOverwrite() checks it before looking for an
	// earlier store that covers every location a later store writes.
	static cl::opt<bool>
	EnablePartialStoreMerging("enable-dse-partial-store-merging",
	cl::init(true), cl::Hidden,
	cl::desc("Enable partial store merging in DSE"));

	//===----------------------------------------------------------------------===//
	// Helper functions
	//===----------------------------------------------------------------------===//
	// Set of half-open byte intervals. As used by isOverwrite(), the key is the
	// interval's ending offset and the mapped value is its starting offset.
	using OverlapIntervalsTy = std::map<int64_t, int64_t>;
	// Per-instruction interval sets, indexed by the earlier (dependent) write.
	using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;

	/// Delete this instruction. Before we do, go through and zero out all the
	/// operands of this instruction. If any of them become dead, delete them and
	/// the computation tree that feeds them.
	/// If ValueSet is non-null, remove any deleted instructions from it as well.
	static void
	deleteDeadInstruction(Instruction I, BasicBlock::iterator BBI,
	MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
	InstOverlapIntervalsTy &IOL,
	DenseMap<Instruction, size_t> InstrOrdering,
	SmallSetVector<Value , 16> ValueSet = nullptr) {
	SmallVector<Instruction*, 32> NowDeadInsts;

	NowDeadInsts.push_back(I);
	--NumFastOther;

	// Keeping the iterator straight is a pain, so we let this routine tell the
	// caller what the next instruction is after we're done mucking about.
	BasicBlock::iterator NewIter = *BBI;

	// Before we touch this instruction, remove it from memdep!
	do {
	Instruction *DeadInst = NowDeadInsts.pop_back_val();
	++NumFastOther;

	// This instruction is dead, zap it, in stages. Start by removing it from
	// MemDep, which needs to know the operands and needs it to be in the
	// function.
	MD.removeInstruction(DeadInst);

	for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
	Value *Op = DeadInst->getOperand(op);
	DeadInst->setOperand(op, nullptr);

	// If this operand just became dead, add it to the NowDeadInsts list.
	if (!Op->use_empty()) continue;

	if (Instruction *OpI = dyn_cast<Instruction>(Op))
	if (isInstructionTriviallyDead(OpI, &TLI))
	NowDeadInsts.push_back(OpI);
	}

	if (ValueSet) ValueSet->remove(DeadInst);
	InstrOrdering->erase(DeadInst);
	IOL.erase(DeadInst);

	if (NewIter == DeadInst->getIterator())
	NewIter = DeadInst->eraseFromParent();
	else
	DeadInst->eraseFromParent();
	} while (!NowDeadInsts.empty());
	*BBI = NewIter;
	}

	/// Does this instruction write some memory? This only returns true for things
	/// that we can analyze with other helpers below: stores, a fixed set of
	/// intrinsics, and direct calls to a few string library functions that
	/// \p TLI reports as available.
	static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
	  if (isa<StoreInst>(I))
	    return true;

	  // Any intrinsic is decided here and never falls through to the call check.
	  if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
	    switch (II->getIntrinsicID()) {
	    case Intrinsic::memset:
	    case Intrinsic::memmove:
	    case Intrinsic::memcpy:
	    case Intrinsic::init_trampoline:
	    case Intrinsic::lifetime_end:
	      return true;
	    default:
	      return false;
	    }
	  }

	  CallSite CS(I);
	  if (!CS)
	    return false;
	  Function *Callee = CS.getCalledFunction();
	  if (!Callee)
	    return false;

	  // Recognize the library functions by name, but only when TLI has them.
	  const StringRef CalleeName = Callee->getName();
	  const LibFunc StringWriters[] = {LibFunc_strcpy, LibFunc_strncpy,
	                                   LibFunc_strcat, LibFunc_strncat};
	  for (LibFunc LF : StringWriters)
	    if (TLI.has(LF) && CalleeName == TLI.getName(LF))
	      return true;
	  return false;
	}

	/// Return a Location stored to by the specified instruction. If isRemovable
	/// returns true, this function and getLocForRead completely describe the memory
	/// operations for this instruction. A default-constructed MemoryLocation is
	/// returned for anything not handled here. \p AA is not used.
	static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
	  if (auto *Store = dyn_cast<StoreInst>(Inst))
	    return MemoryLocation::get(Store);

	  // memcpy/memmove/memset.
	  if (auto *MemOp = dyn_cast<MemIntrinsic>(Inst))
	    return MemoryLocation::getForDest(MemOp);

	  auto *II = dyn_cast<IntrinsicInst>(Inst);
	  if (!II)
	    return MemoryLocation();

	  const Intrinsic::ID ID = II->getIntrinsicID();
	  if (ID == Intrinsic::init_trampoline) {
	    // FIXME: We don't know the size of the trampoline, so we can't really
	    // handle it here.
	    return MemoryLocation(II->getArgOperand(0));
	  }
	  if (ID == Intrinsic::lifetime_end) {
	    // Operand 0 is the constant length, operand 1 the pointer.
	    uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
	    return MemoryLocation(II->getArgOperand(1), Len);
	  }

	  return MemoryLocation(); // Unhandled intrinsic.
	}

	/// Return the location read by the specified "hasMemoryWrite" instruction if
	/// any; otherwise a default-constructed MemoryLocation.
	static MemoryLocation getLocForRead(Instruction *Inst,
	                                    const TargetLibraryInfo &TLI) {
	  assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");

	  // The only instructions that both read and write are the mem transfer
	  // instructions (memcpy/memmove).
	  auto *Transfer = dyn_cast<MemTransferInst>(Inst);
	  return Transfer ? MemoryLocation::getForSource(Transfer) : MemoryLocation();
	}

	/// If the value of this instruction and the memory it writes to is unused, may
	/// we delete this instruction?
	static bool isRemovable(Instruction *I) {
	  // Don't remove volatile/atomic stores.
	  if (auto *Store = dyn_cast<StoreInst>(I))
	    return Store->isUnordered();

	  if (auto *II = dyn_cast<IntrinsicInst>(I)) {
	    switch (II->getIntrinsicID()) {
	    case Intrinsic::memset:
	    case Intrinsic::memmove:
	    case Intrinsic::memcpy:
	      // Don't remove volatile memory intrinsics.
	      return !cast<MemIntrinsic>(II)->isVolatile();
	    case Intrinsic::init_trampoline:
	      // Always safe to remove init_trampoline.
	      return true;
	    case Intrinsic::lifetime_end:
	      // Never remove dead lifetime_end's, e.g. because it is followed by a
	      // free.
	      return false;
	    default:
	      llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
	    }
	  }

	  // A call is removable only if nothing uses its result.
	  CallSite CS(I);
	  if (!CS)
	    return false;
	  return CS.getInstruction()->use_empty();
	}

	/// Returns true if the end of this instruction can be safely shortened in
	/// length. Only the memset and memcpy intrinsics qualify.
	static bool isShortenableAtTheEnd(Instruction *I) {
	  // Stores and libcalls are not intrinsics and are not shortened for now.
	  const auto *II = dyn_cast<IntrinsicInst>(I);
	  if (!II)
	    return false;

	  // Do shorten memory intrinsics.
	  // FIXME: Add memmove if it's also safe to transform.
	  const Intrinsic::ID ID = II->getIntrinsicID();
	  return ID == Intrinsic::memset || ID == Intrinsic::memcpy;
	}

	/// Returns true if the beginning of this instruction can be safely shortened
	/// in length. Only the memset intrinsic qualifies.
	static bool isShortenableAtTheBeginning(Instruction *I) {
	  // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
	  // easily done by offsetting the source address.
	  if (const auto *II = dyn_cast<IntrinsicInst>(I))
	    return II->getIntrinsicID() == Intrinsic::memset;
	  return false;
	}

	/// Return the pointer that is being written to.
	///
	/// Handles stores, memory intrinsics (their destination), init_trampoline
	/// (argument 0) and, for any other call, its first argument. Any other
	/// intrinsic hits llvm_unreachable.
	static Value *getStoredPointerOperand(Instruction *I) {
	  if (StoreInst *SI = dyn_cast<StoreInst>(I))
	    return SI->getPointerOperand();
	  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
	    return MI->getDest();

	  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
	    switch (II->getIntrinsicID()) {
	    default: llvm_unreachable("Unexpected intrinsic!");
	    case Intrinsic::init_trampoline:
	      return II->getArgOperand(0);
	    }
	  }

	  CallSite CS(I);
	  // All the supported functions so far happen to have dest as their first
	  // argument.
	  return CS.getArgument(0);
	}

	/// Return the size reported by getObjectSize() for \p V, or
	/// MemoryLocation::UnknownSize when getObjectSize() fails.
	static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
	                               const TargetLibraryInfo &TLI) {
	  uint64_t ObjSize;
	  const bool Known = getObjectSize(V, ObjSize, DL, &TLI);
	  return Known ? ObjSize : MemoryLocation::UnknownSize;
	}

	namespace {

	// Result of isOverwrite(): how a later store relates to an earlier one.
	enum OverwriteResult {
	// The beginning of the earlier location is overwritten by the later one.
	OW_Begin,
	// The later store completely overwrites the earlier location.
	OW_Complete,
	// The end of the earlier location is completely overwritten.
	OW_End,
	// A big earlier store is partly overwritten by a smaller later store that
	// does not write outside the earlier store's memory.
	OW_PartialEarlierWithFullLater,
	// Nothing can be determined.
	OW_Unknown
	};

	} // end anonymous namespace

	/// Return 'OW_Complete' if a store to the 'Later' location completely
	/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
	/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
	/// beginning of the 'Earlier' location is overwritten by 'Later'.
	/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was
	/// overwritten by a later (smaller) store which doesn't write outside the big
	/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
	static OverwriteResult isOverwrite(const MemoryLocation &Later,
	const MemoryLocation &Earlier,
	const DataLayout &DL,
	const TargetLibraryInfo &TLI,
	int64_t &EarlierOff, int64_t &LaterOff,
	Instruction *DepWrite,
	InstOverlapIntervalsTy &IOL) {
	// If we don't know the sizes of either access, then we can't do a comparison.
	if (Later.Size == MemoryLocation::UnknownSize \|\|
	Earlier.Size == MemoryLocation::UnknownSize)
	return OW_Unknown;

	const Value *P1 = Earlier.Ptr->stripPointerCasts();
	const Value *P2 = Later.Ptr->stripPointerCasts();

	// If the start pointers are the same, we just have to compare sizes to see if
	// the later store was larger than the earlier store.
	if (P1 == P2) {
	// Make sure that the Later size is >= the Earlier size.
	if (Later.Size >= Earlier.Size)
	return OW_Complete;
	}

	// Check to see if the later store is to the entire object (either a global,
	// an alloca, or a byval/inalloca argument). If so, then it clearly
	// overwrites any other store to the same object.
	const Value *UO1 = GetUnderlyingObject(P1, DL),
	*UO2 = GetUnderlyingObject(P2, DL);

	// If we can't resolve the same pointers to the same object, then we can't
	// analyze them at all.
	if (UO1 != UO2)
	return OW_Unknown;

	// If the "Later" store is to a recognizable object, get its size.
	uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
	if (ObjectSize != MemoryLocation::UnknownSize)
	if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
	return OW_Complete;

	// Okay, we have stores to two completely different pointers. Try to
	// decompose the pointer into a "base + constant_offset" form. If the base
	// pointers are equal, then we can reason about the two stores.
	EarlierOff = 0;
	LaterOff = 0;
	const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
	const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);

	// If the base pointers still differ, we have two completely different stores.
	if (BP1 != BP2)
	return OW_Unknown;

	// The later store completely overlaps the earlier store if:
	//
	// 1. Both start at the same offset and the later one's size is greater than
	// or equal to the earlier one's, or
	//
	// \|--earlier--\|
	// \|-- later --\|
	//
	// 2. The earlier store has an offset greater than the later offset, but which
	// still lies completely within the later store.
	//
	// \|--earlier--\|
	// \|----- later ------\|
	//
	// We have to be careful here as Off is signed while .Size is unsigned.
	if (EarlierOff >= LaterOff &&
	Later.Size >= Earlier.Size &&
	uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
	return OW_Complete;

	// We may now overlap, although the overlap is not complete. There might also
	// be other incomplete overlaps, and together, they might cover the complete
	// earlier write.
	// Note: The correctness of this logic depends on the fact that this function
	// is not even called providing DepWrite when there are any intervening reads.
	if (EnablePartialOverwriteTracking &&
	LaterOff < int64_t(EarlierOff + Earlier.Size) &&
	int64_t(LaterOff + Later.Size) >= EarlierOff) {

	// Insert our part of the overlap into the map.
	auto &IM = IOL[DepWrite];
	DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<
	int64_t(EarlierOff + Earlier.Size) << ") Later [" <<
	LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n");

	// Make sure that we only insert non-overlapping intervals and combine
	// adjacent intervals. The intervals are stored in the map with the ending
	// offset as the key (in the half-open sense) and the starting offset as
	// the value.
	int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + Later.Size;

	// Find any intervals ending at, or after, LaterIntStart which start
	// before LaterIntEnd.
	auto ILI = IM.lower_bound(LaterIntStart);
	if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
	// This existing interval is overlapped with the current store somewhere
	// in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
	// intervals and adjusting our start and end.
	LaterIntStart = std::min(LaterIntStart, ILI->second);
	LaterIntEnd = std::max(LaterIntEnd, ILI->first);
	ILI = IM.erase(ILI);

	// Continue erasing and adjusting our end in case other previous
	// intervals are also overlapped with the current store.
	//
	// \|--- ealier 1 ---\| \|--- ealier 2 ---\|
	// \|------- later---------\|
	//
	while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
	assert(ILI->second > LaterIntStart && "Unexpected interval");
	LaterIntEnd = std::max(LaterIntEnd, ILI->first);
	ILI = IM.erase(ILI);
	}
	}

	IM[LaterIntEnd] = LaterIntStart;

	ILI = IM.begin();
	if (ILI->second <= EarlierOff &&
	ILI->first >= int64_t(EarlierOff + Earlier.Size)) {
	DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" <<
	EarlierOff << ", " <<
	int64_t(EarlierOff + Earlier.Size) <<
	") Composite Later [" <<
	ILI->second << ", " << ILI->first << ")\n");
	++NumCompletePartials;
	return OW_Complete;
	}
	}

	// Check for an earlier store which writes to all the memory locations that
	// the later store writes to.
	if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
	int64_t(EarlierOff + Earlier.Size) > LaterOff &&
	uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) {
	DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff
	<< ", " << int64_t(EarlierOff + Earlier.Size)
	<< ") by a later store [" << LaterOff << ", "
	<< int64_t(LaterOff + Later.Size) << ")\n");
	// TODO: Maybe come up with a better name?
	return OW_PartialEarlierWithFullLater;
	}

	// Another interesting case is if the later store overwrites the end of the
	// earlier store.
	//
	// \|--earlier--\|
	// \|-- later --\|
	//
	// In this case we may want to trim the size of earlier to avoid generating
	// writes to addresses which will definitely be overwritten later
	if (!EnablePartialOverwriteTracking &&
	(LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
	int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
	return OW_End;

	// Finally, we also need to check if the later store overwrites the beginning
	// of the earlier store.
	//
	// \|--earlier--\|
	// \|-- later --\|
	//
	// In this case we may want to move the destination address and trim the size
	// of earlier to avoid generating writes to addresses which will definitely
	// be overwritten later.
	if (!EnablePartialOverwriteTracking &&
	(LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
	assert(int64_t(LaterOff + Later.Size) <
	int64_t(EarlierOff + Earlier.Size) &&
	"Expect to be handled as OW_Complete");
	return OW_Begin;
	}
	// Otherwise, they don't completely overlap.
	return OW_Unknown;
	}

	/// Report whether it is unsafe to treat \p DepWrite as dead because \p Inst may
	/// read the very memory it overwrites (a "self read", i.e. a no-op copy of a
	/// region onto itself). Example:
	///
	///   memcpy(A <- B)
	///   memcpy(A <- A)
	///
	/// Here the second copy does not kill the first one. An explicit A<-A copy is
	/// rare; the realistic case is two pointers that merely may alias.
	///
	/// \param Inst          The later, DSE-inducing instruction.
	/// \param InstStoreLoc  The location written by \p Inst.
	/// \param DepWrite      The earlier write that is a removal candidate.
	/// \returns false when \p Inst reads nothing, when its read location cannot
	///          alias \p InstStoreLoc, or when \p DepWrite provably reads from the
	///          same pointer as \p Inst; true otherwise.
	static bool isPossibleSelfRead(Instruction *Inst,
	                               const MemoryLocation &InstStoreLoc,
	                               Instruction *DepWrite,
	                               const TargetLibraryInfo &TLI,
	                               AliasAnalysis &AA) {
	  // An instruction that reads no memory cannot read its own destination.
	  const MemoryLocation ReadLoc = getLocForRead(Inst, TLI);
	  if (ReadLoc.Ptr == nullptr)
	    return false;

	  // Source and destination are provably disjoint: not a self read.
	  if (AA.isNoAlias(ReadLoc, InstStoreLoc))
	    return false;

	  // 'Inst' may copy over itself. DepWrite is still removable when it reads
	  // from the same source as Inst, e.g.
	  //   memcpy(A <- B)
	  //   memcpy(A <- B)
	  // A/B may alias, but B/B must alias, so dropping the first copy is safe
	  // (assuming it writes no more bytes than the second one).
	  const MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
	  const bool ReadsSameSource =
	      DepReadLoc.Ptr && AA.isMustAlias(ReadLoc.Ptr, DepReadLoc.Ptr);

	  // Without that proof (or if DepWrite reads nothing), stay conservative.
	  return !ReadsSameSource;
	}

	/// Returns true if the memory which is accessed by the second instruction is not
	/// modified between the first and the second instruction.
	///
	/// The check walks the CFG backwards from \p SecondI's block towards
	/// \p FirstI's block and queries \p AA for every instruction that may write
	/// memory on the way.
	///
	/// Precondition: Second instruction must be dominated by the first
	/// instruction.
	///
	/// \param FirstI   Earlier instruction; scanning in its block starts right
	///                 after it.
	/// \param SecondI  Later instruction whose MemoryLocation is checked.
	/// \param AA       Alias analysis used for the mod/ref queries.
	/// \returns false as soon as an instruction other than \p SecondI may modify
	///          the location, true if none is found.
	static bool memoryIsNotModifiedBetween(Instruction *FirstI,
	                                       Instruction *SecondI,
	                                       AliasAnalysis *AA) {
	  SmallVector<BasicBlock *, 16> WorkList;
	  SmallPtrSet<BasicBlock *, 8> Visited;
	  BasicBlock::iterator FirstBBI(FirstI);
	  ++FirstBBI;
	  BasicBlock::iterator SecondBBI(SecondI);
	  BasicBlock *FirstBB = FirstI->getParent();
	  BasicBlock *SecondBB = SecondI->getParent();
	  MemoryLocation MemLoc = MemoryLocation::get(SecondI);

	  // Start checking in the block that contains SecondI.
	  WorkList.push_back(SecondBB);
	  bool isFirstBlock = true;

	  // Check all blocks going backward until we reach FirstI's block.
	  while (!WorkList.empty()) {
	    BasicBlock *B = WorkList.pop_back_val();

	    // Ignore instructions up to and including FirstI if this is FirstBB.
	    BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());

	    BasicBlock::iterator EI;
	    if (isFirstBlock) {
	      // Ignore instructions after SecondI on the first visit of SecondBB.
	      assert(B == SecondBB && "first block is not the store block");
	      EI = SecondBBI;
	      isFirstBlock = false;
	    } else {
	      // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
	      // In this case we also have to look at instructions after SecondI.
	      EI = B->end();
	    }
	    for (; BI != EI; ++BI) {
	      Instruction *I = &*BI;
	      if (I->mayWriteToMemory() && I != SecondI)
	        if (isModSet(AA->getModRefInfo(I, MemLoc)))
	          return false;
	    }
	    if (B != FirstBB) {
	      assert(B != &FirstBB->getParent()->getEntryBlock() &&
	             "Should not hit the entry block because SI must be dominated by LI");
	      // Queue each predecessor once; SecondBB itself is not pre-marked, so a
	      // loop can bring it back for a full-block scan.
	      for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
	        if (!Visited.insert(*PredI).second)
	          continue;
	        WorkList.push_back(*PredI);
	      }
	    }
	  }
	  return true;
	}

	/// Find all blocks that will unconditionally lead to the block BB and append
	/// them to F.
	static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
	BasicBlock BB, DominatorTree DT) {
	for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
	BasicBlock Pred = I;
	if (Pred == BB) continue;
	TerminatorInst *PredTI = Pred->getTerminator();
	if (PredTI->getNumSuccessors() != 1)
	continue;

	if (DT->isReachableFromEntry(Pred))
	Blocks.push_back(Pred);
	}
	}

	/// Handle frees of entire structures whose dependency is a store
	/// to a field of that structure.
	static bool handleFree(CallInst F, AliasAnalysis AA,
	MemoryDependenceResults MD, DominatorTree DT,
	const TargetLibraryInfo *TLI,
	InstOverlapIntervalsTy &IOL,
	DenseMap<Instruction, size_t> InstrOrdering) {
	bool MadeChange = false;

	MemoryLocation Loc = MemoryLocation(F->getOperand(0));
	SmallVector<BasicBlock *, 16> Blocks;
	Blocks.push_back(F->getParent());
	const DataLayout &DL = F->getModule()->getDataLayout();

	while (!Blocks.empty()) {
	BasicBlock *BB = Blocks.pop_back_val();
	Instruction *InstPt = BB->getTerminator();
	if (BB == F->getParent()) InstPt = F;

	MemDepResult Dep =
	MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
	while (Dep.isDef() \|\| Dep.isClobber()) {
	Instruction *Dependency = Dep.getInst();
	if (!hasMemoryWrite(Dependency, *TLI) \|\| !isRemovable(Dependency))
	break;

	Value *DepPointer =
	GetUnderlyingObject(getStoredPointerOperand(Dependency), DL);

	// Check for aliasing.
	if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
	break;

	DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
	<< *Dependency << '\n');

	// DCE instructions only used to calculate that store.
	BasicBlock::iterator BBI(Dependency);
	deleteDeadInstruction(Dependency, &BBI, MD, TLI, IOL, InstrOrdering);
	++NumFastStores;
	MadeChange = true;

	// Inst's old Dependency is now deleted. Compute the next dependency,
	// which may also be dead, as in
	// s[0] = 0;
	// s[1] = 0; // This has just been deleted.
	// free(s);
	Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
	}

	if (Dep.isNonLocal())
	findUnconditionalPreds(Blocks, BB, DT);
	}

	return MadeChange;
	}

	/// Check to see if the specified location may alias any of the stack objects in
	/// the DeadStackObjects set. If so, they become live because the location is
	/// being loaded.
	///
	/// \param LoadedLoc         The location being read.
	/// \param DeadStackObjects  Set to prune; objects that may be read through
	///                          \p LoadedLoc are removed from it.
	static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
	                                  SmallSetVector<Value *, 16> &DeadStackObjects,
	                                  const DataLayout &DL, AliasAnalysis *AA,
	                                  const TargetLibraryInfo *TLI) {
	  const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);

	  // A constant can't be in the dead pointer set.
	  if (isa<Constant>(UnderlyingPointer))
	    return;

	  // If the kill pointer can be easily reduced to an alloca, don't bother doing
	  // extraneous AA queries. Arguments get the same direct-removal shortcut.
	  if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
	    DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer));
	    return;
	  }

	  // Remove objects that could alias LoadedLoc.
	  DeadStackObjects.remove_if([&](Value *I) {
	    // See if the loaded location could alias the stack location.
	    MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
	    return !AA->isNoAlias(StackLoc, LoadedLoc);
	  });
	}

	/// Remove dead stores to stack-allocated locations in the function end block.
	/// Ex:
	/// %A = alloca i32
	/// ...
	/// store i32 1, i32* %A
	/// ret void
	static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
	MemoryDependenceResults *MD,
	const TargetLibraryInfo *TLI,
	InstOverlapIntervalsTy &IOL,
	DenseMap<Instruction, size_t> InstrOrdering) {
	bool MadeChange = false;

	// Keep track of all of the stack objects that are dead at the end of the
	// function.
	SmallSetVector<Value*, 16> DeadStackObjects;

	// Find all of the alloca'd pointers in the entry block.
	BasicBlock &Entry = BB.getParent()->front();
	for (Instruction &I : Entry) {
	if (isa<AllocaInst>(&I))
	DeadStackObjects.insert(&I);

	// Okay, so these are dead heap objects, but if the pointer never escapes
	// then it's leaked by this function anyways.
	else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
	DeadStackObjects.insert(&I);
	}

	// Treat byval or inalloca arguments the same, stores to them are dead at the
	// end of the function.
	for (Argument &AI : BB.getParent()->args())
	if (AI.hasByValOrInAllocaAttr())
	DeadStackObjects.insert(&AI);

	const DataLayout &DL = BB.getModule()->getDataLayout();

	// Scan the basic block backwards
	for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
	--BBI;

	// If we find a store, check to see if it points into a dead stack value.
	if (hasMemoryWrite(&BBI, TLI) && isRemovable(&*BBI)) {
	// See through pointer-to-pointer bitcasts
	SmallVector<Value *, 4> Pointers;
	GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);

	// Stores to stack values are valid candidates for removal.
	bool AllDead = true;
	for (Value *Pointer : Pointers)
	if (!DeadStackObjects.count(Pointer)) {
	AllDead = false;
	break;
	}

	if (AllDead) {
	Instruction Dead = &BBI;

	DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
	<< *Dead << "\n Objects: ";
	for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
	E = Pointers.end(); I != E; ++I) {
	dbgs() << **I;
	if (std::next(I) != E)
	dbgs() << ", ";
	}
	dbgs() << '\n');

	// DCE instructions only used to calculate that store.
	deleteDeadInstruction(Dead, &BBI, MD, TLI, IOL, InstrOrdering, &DeadStackObjects);
	++NumFastStores;
	MadeChange = true;
	continue;
	}
	}

	// Remove any dead non-memory-mutating instructions.
	if (isInstructionTriviallyDead(&*BBI, TLI)) {
	DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
	<< &BBI << '\n');
	deleteDeadInstruction(&BBI, &BBI, MD, *TLI, IOL, InstrOrdering, &DeadStackObjects);
	++NumFastOther;
	MadeChange = true;
	continue;
	}

	if (isa<AllocaInst>(BBI)) {
	// Remove allocas from the list of dead stack objects; there can't be
	// any references before the definition.
	DeadStackObjects.remove(&*BBI);
	continue;
	}

	if (auto CS = CallSite(&*BBI)) {
	// Remove allocation function calls from the list of dead stack objects;
	// there can't be any references before the definition.
	if (isAllocLikeFn(&*BBI, TLI))
	DeadStackObjects.remove(&*BBI);

	// If this call does not access memory, it can't be loading any of our
	// pointers.
	if (AA->doesNotAccessMemory(CS))
	continue;

	// If the call might load from any of our allocas, then any store above
	// the call is live.
	DeadStackObjects.remove_if([&](Value *I) {
	// See if the call site touches the value.
	return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)));
	});

	// If all of the allocas were clobbered by the call then we're not going
	// to find anything else to process.
	if (DeadStackObjects.empty())
	break;

	continue;
	}

	// We can remove the dead stores, irrespective of the fence and its ordering
	// (release/acquire/seq_cst). Fences only constraints the ordering of
	// already visible stores, it does not make a store visible to other
	// threads. So, skipping over a fence does not change a store from being
	// dead.
	if (isa<FenceInst>(*BBI))
	continue;

	MemoryLocation LoadedLoc;

	// If we encounter a use of the pointer, it is no longer considered dead
	if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
	if (!L->isUnordered()) // Be conservative with atomic/volatile load
	break;
	LoadedLoc = MemoryLocation::get(L);
	} else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
	LoadedLoc = MemoryLocation::get(V);
	} else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
	LoadedLoc = MemoryLocation::getForSource(MTI);
	} else if (!BBI->mayReadFromMemory()) {
	// Instruction doesn't read memory. Note that stores that weren't removed
	// above will hit this case.
	continue;
	} else {
	// Unknown inst; assume it clobbers everything.
	break;
	}

	// Remove any allocas from the DeadPointer set that are loaded, as this
	// makes any stores above the access live.
	removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI);

	// If all of the allocas were clobbered by the access then we're not going
	// to find anything else to process.
	if (DeadStackObjects.empty())
	break;
	}

	return MadeChange;
	}

	/// Try to trim the memory intrinsic \p EarlierWrite so it no longer writes the
	/// bytes covered by a later write.
	///
	/// \param EarlierWrite    The earlier write; cast to MemIntrinsic.
	/// \param EarlierOffset   Start offset of the earlier write; advanced when the
	///                        beginning is trimmed.
	/// \param EarlierSize     Size of the earlier write; set to the new length on
	///                        success.
	/// \param LaterOffset     Start offset of the later write.
	/// \param LaterSize       Size of the later write.
	/// \param IsOverwriteEnd  true when the later write covers the end of the
	///                        earlier one, false when it covers the beginning.
	/// \returns true if the intrinsic was shortened, false if the cut point was
	///          rejected by the alignment / power-of-two check.
	static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
	                         int64_t &EarlierSize, int64_t LaterOffset,
	                         int64_t LaterSize, bool IsOverwriteEnd) {
	  // TODO: base this on the target vector size so that if the earlier
	  // store was too small to get vector writes anyway then it's likely
	  // a good idea to shorten it.
	  // Power of 2 vector writes are probably always a bad idea to optimize
	  // as any store/memset/memcpy is likely using vector instructions so
	  // shortening it to not vector size is likely to be slower.
	  MemIntrinsic *MemIntr = cast<MemIntrinsic>(EarlierWrite);
	  unsigned DestAlign = MemIntr->getAlignment();

	  // When trimming the front, the cut point is where the later write ends.
	  if (!IsOverwriteEnd)
	    LaterOffset = int64_t(LaterOffset + LaterSize);

	  // Accept the cut point only if it is a power of two no smaller than the
	  // alignment, or a multiple of a non-zero alignment.
	  bool CutIsPow2 = isPowerOf2_64(LaterOffset) && DestAlign <= LaterOffset;
	  bool CutIsAligned = (DestAlign != 0) && LaterOffset % DestAlign == 0;
	  if (!CutIsPow2 && !CutIsAligned)
	    return false;

	  DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
	               << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
	               << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
	               << ")\n");

	  int64_t NewLength;
	  if (IsOverwriteEnd)
	    NewLength = LaterOffset - EarlierOffset;
	  else
	    NewLength = EarlierSize - (LaterOffset - EarlierOffset);

	  Value *OldLength = MemIntr->getLength();
	  MemIntr->setLength(ConstantInt::get(OldLength->getType(), NewLength));
	  EarlierSize = NewLength;

	  // Trimming the end leaves the destination pointer untouched.
	  if (IsOverwriteEnd)
	    return true;

	  // Trimming the front: move the destination past the overwritten bytes.
	  int64_t OffsetMoved = (LaterOffset - EarlierOffset);
	  Value *Indices[1] = {ConstantInt::get(OldLength->getType(), OffsetMoved)};
	  GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
	      MemIntr->getRawDest(), Indices, "", EarlierWrite);
	  MemIntr->setDest(NewDestGEP);
	  EarlierOffset += OffsetMoved;
	  return true;
	}

	/// Try to trim the end of \p EarlierWrite using the last interval recorded in
	/// \p IntervalMap (key = interval end, value = interval start).
	///
	/// The trim is attempted only when that interval starts strictly inside the
	/// earlier write and reaches at least to its end. On success the interval is
	/// erased from \p IntervalMap and \p EarlierSize is updated by tryToShorten.
	///
	/// \returns true if the earlier write was shortened.
	static bool tryToShortenEnd(Instruction *EarlierWrite,
	                            OverlapIntervalsTy &IntervalMap,
	                            int64_t &EarlierStart, int64_t &EarlierSize) {
	  if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
	    return false;

	  // The last entry in the map is the one that can cover the tail.
	  OverlapIntervalsTy::iterator OII = --IntervalMap.end();
	  int64_t LaterStart = OII->second;
	  int64_t LaterSize = OII->first - LaterStart;

	  if (LaterStart > EarlierStart && LaterStart < EarlierStart + EarlierSize &&
	      LaterStart + LaterSize >= EarlierStart + EarlierSize) {
	    if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
	                     LaterSize, true)) {
	      IntervalMap.erase(OII);
	      return true;
	    }
	  }
	  return false;
	}

	/// Try to trim the beginning of \p EarlierWrite using the first interval
	/// recorded in \p IntervalMap (key = interval end, value = interval start).
	///
	/// The trim is attempted only when that interval starts at or before the
	/// earlier write and ends after its start. On success the interval is erased
	/// from \p IntervalMap and \p EarlierStart / \p EarlierSize are updated by
	/// tryToShorten.
	///
	/// \returns true if the earlier write was shortened.
	static bool tryToShortenBegin(Instruction *EarlierWrite,
	                              OverlapIntervalsTy &IntervalMap,
	                              int64_t &EarlierStart, int64_t &EarlierSize) {
	  if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
	    return false;

	  // The first entry in the map is the one that can cover the head.
	  OverlapIntervalsTy::iterator OII = IntervalMap.begin();
	  int64_t LaterStart = OII->second;
	  int64_t LaterSize = OII->first - LaterStart;

	  if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
	    // An interval that also reaches the end would cover the whole write.
	    assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
	           "Should have been handled as OW_Complete");
	    if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
	                     LaterSize, false)) {
	      IntervalMap.erase(OII);
	      return true;
	    }
	  }
	  return false;
	}

	/// For every earlier write recorded in \p IOL, try to shorten it at the end and
	/// then at the beginning using its recorded overlap intervals.
	///
	/// \param AA   Used to recompute the location written by each earlier write.
	/// \param DL   Used to decompose the written pointer into base + offset.
	/// \param IOL  Map from earlier write to its overlap intervals.
	/// \returns true if any write was shortened.
	static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
	                                            const DataLayout &DL,
	                                            InstOverlapIntervalsTy &IOL) {
	  bool Changed = false;
	  // OI is a by-value copy of the entry, so interval erasures done by the
	  // helpers below act on the copy and leave IOL itself unchanged.
	  for (auto OI : IOL) {
	    Instruction *EarlierWrite = OI.first;
	    MemoryLocation Loc = getLocForWrite(EarlierWrite, *AA);
	    assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
	    assert(Loc.Size != MemoryLocation::UnknownSize && "Unexpected mem loc");

	    const Value *Ptr = Loc.Ptr->stripPointerCasts();
	    int64_t EarlierStart = 0;
	    int64_t EarlierSize = int64_t(Loc.Size);
	    // Only the constant offset is needed here; the returned base is ignored.
	    GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
	    OverlapIntervalsTy &IntervalMap = OI.second;
	    Changed |=
	        tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
	    // Trimming the end may have consumed the only interval.
	    if (IntervalMap.empty())
	      continue;
	    Changed |=
	        tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
	  }
	  return Changed;
	}

	static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
	AliasAnalysis AA, MemoryDependenceResults MD,
	const DataLayout &DL,
	const TargetLibraryInfo *TLI,
	InstOverlapIntervalsTy &IOL,
	DenseMap<Instruction, size_t> InstrOrdering) {
	// Must be a store instruction.
	StoreInst *SI = dyn_cast<StoreInst>(Inst);
	if (!SI)
	return false;

	// If we're storing the same value back to a pointer that we just loaded from,
	// then the store can be removed.
	if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
	if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
	isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) {

	DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
	<< DepLoad << "\n STORE: " << SI << '\n');

	deleteDeadInstruction(SI, &BBI, MD, TLI, IOL, InstrOrdering);
	++NumRedundantStores;
	return true;
	}
	}

	// Remove null stores into the calloc'ed objects
	Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
	if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
	Instruction *UnderlyingPointer =
	dyn_cast<Instruction>(GetUnderlyingObject(SI->getPointerOperand(), DL));

	if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
	memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) {
	DEBUG(
	dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
	<< Inst << "\n OBJECT: " << UnderlyingPointer << '\n');

	deleteDeadInstruction(SI, &BBI, MD, TLI, IOL, InstrOrdering);
	++NumRedundantStores;
	return true;
	}
	}
	return false;
	}

	static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
	MemoryDependenceResults MD, DominatorTree DT,
	const TargetLibraryInfo *TLI) {
	const DataLayout &DL = BB.getModule()->getDataLayout();
	bool MadeChange = false;

	// FIXME: Maybe change this to use some abstraction like OrderedBasicBlock?
	// The current OrderedBasicBlock can't deal with mutation at the moment.
	size_t LastThrowingInstIndex = 0;
	DenseMap<Instruction*, size_t> InstrOrdering;
	size_t InstrIndex = 1;

	// A map of interval maps representing partially-overwritten value parts.
	InstOverlapIntervalsTy IOL;

	// Do a top-down walk on the BB.
	for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
	// Handle 'free' calls specially.
	if (CallInst F = isFreeCall(&BBI, TLI)) {
	MadeChange \|= handleFree(F, AA, MD, DT, TLI, IOL, &InstrOrdering);
	// Increment BBI after handleFree has potentially deleted instructions.
	// This ensures we maintain a valid iterator.
	++BBI;
	continue;
	}

	Instruction Inst = &BBI++;

	size_t CurInstNumber = InstrIndex++;
	InstrOrdering.insert(std::make_pair(Inst, CurInstNumber));
	if (Inst->mayThrow()) {
	LastThrowingInstIndex = CurInstNumber;
	continue;
	}

	// Check to see if Inst writes to memory. If not, continue.
	if (!hasMemoryWrite(Inst, *TLI))
	continue;

	// eliminateNoopStore will update in iterator, if necessary.
	if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, &InstrOrdering)) {
	MadeChange = true;
	continue;
	}

	// If we find something that writes memory, get its memory dependence.
	MemDepResult InstDep = MD->getDependency(Inst);

	// Ignore any store where we can't find a local dependence.
	// FIXME: cross-block DSE would be fun. :)
	if (!InstDep.isDef() && !InstDep.isClobber())
	continue;

	// Figure out what location is being stored to.
	MemoryLocation Loc = getLocForWrite(Inst, *AA);

	// If we didn't get a useful location, fail.
	if (!Loc.Ptr)
	continue;

	// Loop until we find a store we can eliminate or a load that
	// invalidates the analysis. Without an upper bound on the number of
	// instructions examined, this analysis can become very time-consuming.
	// However, the potential gain diminishes as we process more instructions
	// without eliminating any of them. Therefore, we limit the number of
	// instructions we look at.
	auto Limit = MD->getDefaultBlockScanLimit();
	while (InstDep.isDef() \|\| InstDep.isClobber()) {
	// Get the memory clobbered by the instruction we depend on. MemDep will
	// skip any instructions that 'Loc' clearly doesn't interact with. If we
	// end up depending on a may- or must-aliased load, then we can't optimize
	// away the store and we bail out. However, if we depend on something
	// that overwrites the memory location we can potentially optimize it.
	//
	// Find out what memory location the dependent instruction stores.
	Instruction *DepWrite = InstDep.getInst();
	MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
	// If we didn't get a useful location, or if it isn't a size, bail out.
	if (!DepLoc.Ptr)
	break;

	// Make sure we don't look past a call which might throw. This is an
	// issue because MemoryDependenceAnalysis works in the wrong direction:
	// it finds instructions which dominate the current instruction, rather than
	// instructions which are post-dominated by the current instruction.
	//
	// If the underlying object is a non-escaping memory allocation, any store
	// to it is dead along the unwind edge. Otherwise, we need to preserve
	// the store.
	size_t DepIndex = InstrOrdering.lookup(DepWrite);
	assert(DepIndex && "Unexpected instruction");
	if (DepIndex <= LastThrowingInstIndex) {
	const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
	bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
	if (!IsStoreDeadOnUnwind) {
	// We're looking for a call to an allocation function
	// where the allocation doesn't escape before the last
	// throwing instruction; PointerMayBeCaptured
	// reasonably fast approximation.
	IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
	!PointerMayBeCaptured(Underlying, false, true);
	}
	if (!IsStoreDeadOnUnwind)
	break;
	}

	// If we find a write that is a) removable (i.e., non-volatile), b) is
	// completely obliterated by the store to 'Loc', and c) which we know that
	// 'Inst' doesn't load from, then we can remove it.
	// Also try to merge two stores if a later one only touches memory written
	// to by the earlier one.
	if (isRemovable(DepWrite) &&
	!isPossibleSelfRead(Inst, Loc, DepWrite, TLI, AA)) {
	int64_t InstWriteOffset, DepWriteOffset;
	OverwriteResult OR =
	isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
	DepWrite, IOL);
	if (OR == OW_Complete) {
	DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
	<< DepWrite << "\n KILLER: " << Inst << '\n');

	// Delete the store and now-dead instructions that feed it.
	deleteDeadInstruction(DepWrite, &BBI, MD, TLI, IOL, &InstrOrdering);
	++NumFastStores;
	MadeChange = true;

	// We erased DepWrite; start over.
	InstDep = MD->getDependency(Inst);
	continue;
	} else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) \|\|
	((OR == OW_Begin &&
	isShortenableAtTheBeginning(DepWrite)))) {
	assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
	"when partial-overwrite "
	"tracking is enabled");
	int64_t EarlierSize = DepLoc.Size;
	int64_t LaterSize = Loc.Size;
	bool IsOverwriteEnd = (OR == OW_End);
	MadeChange \|= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
	InstWriteOffset, LaterSize, IsOverwriteEnd);
	} else if (EnablePartialStoreMerging &&
	OR == OW_PartialEarlierWithFullLater) {
	auto *Earlier = dyn_cast<StoreInst>(DepWrite);
	auto *Later = dyn_cast<StoreInst>(Inst);
	if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
	- Later && isa<ConstantInt>(Later->getValueOperand())) {
	+ Later && isa<ConstantInt>(Later->getValueOperand()) &&
	+ memoryIsNotModifiedBetween(Earlier, Later, AA)) {
	// If the store we find is:
	// a) partially overwritten by the store to 'Loc'
	// b) the later store is fully contained in the earlier one and
	// c) they both have a constant value
	// Merge the two stores, replacing the earlier store's value with a
	// merge of both values.
	// TODO: Deal with other constant types (vectors, etc), and probably
	// some mem intrinsics (if needed)

	APInt EarlierValue =
	cast<ConstantInt>(Earlier->getValueOperand())->getValue();
	APInt LaterValue =
	cast<ConstantInt>(Later->getValueOperand())->getValue();
	unsigned LaterBits = LaterValue.getBitWidth();
	assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
	LaterValue = LaterValue.zext(EarlierValue.getBitWidth());

	// Offset of the smaller store inside the larger store
	unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
	unsigned LShiftAmount =
	DL.isBigEndian()
	? EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits
	: BitOffsetDiff;
	APInt Mask =
	APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
	LShiftAmount + LaterBits);
	// Clear the bits we'll be replacing, then OR with the smaller
	// store, shifted appropriately.
	APInt Merged =
	(EarlierValue & ~Mask) \| (LaterValue << LShiftAmount);
	DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
	<< "\n Later: " << *Inst
	<< "\n Merged Value: " << Merged << '\n');

	auto *SI = new StoreInst(
	ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
	Earlier->getPointerOperand(), false, Earlier->getAlignment(),
	Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);

	unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
	LLVMContext::MD_alias_scope,
	LLVMContext::MD_noalias,
	LLVMContext::MD_nontemporal};
	SI->copyMetadata(*DepWrite, MDToKeep);
	++NumModifiedStores;

	// Remove earlier, wider, store
	size_t Idx = InstrOrdering.lookup(DepWrite);
	InstrOrdering.erase(DepWrite);
	InstrOrdering.insert(std::make_pair(SI, Idx));

	// Delete the old stores and now-dead instructions that feed them.
	deleteDeadInstruction(Inst, &BBI, MD, TLI, IOL, &InstrOrdering);
	deleteDeadInstruction(DepWrite, &BBI, MD, TLI, IOL,
	&InstrOrdering);
	MadeChange = true;

	// We erased DepWrite and Inst (Loc); start over.
	break;
	}
	}
	}

	// If this is a may-aliased store that is clobbering the store value, we
	// can keep searching past it for another must-aliased pointer that stores
	// to the same location. For example, in:
	// store -> P
	// store -> Q
	// store -> P
	// we can remove the first store to P even though we don't know if P and Q
	// alias.
	if (DepWrite == &BB.front()) break;

	// Can't look past this instruction if it might read 'Loc'.
	if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
	break;

	InstDep = MD->getPointerDependencyFrom(Loc, /isLoad=/ false,
	DepWrite->getIterator(), &BB,
	/QueryInst=/ nullptr, &Limit);
	}
	}

	if (EnablePartialOverwriteTracking)
	MadeChange \|= removePartiallyOverlappedStores(AA, DL, IOL);

	// If this block ends in a return, unwind, or unreachable, all allocas are
	// dead at its end, which means stores to them are also dead.
	if (BB.getTerminator()->getNumSuccessors() == 0)
	MadeChange \|= handleEndBlock(BB, AA, MD, TLI, IOL, &InstrOrdering);

	return MadeChange;
	}

	static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
	MemoryDependenceResults MD, DominatorTree DT,
	const TargetLibraryInfo *TLI) {
	bool MadeChange = false;
	for (BasicBlock &BB : F)
	// Only check non-dead blocks. Dead blocks may have strange pointer
	// cycles that will confuse alias analysis.
	if (DT->isReachableFromEntry(&BB))
	MadeChange \|= eliminateDeadStores(BB, AA, MD, DT, TLI);

	return MadeChange;
	}

	//===----------------------------------------------------------------------===//
	// DSE Pass
	//===----------------------------------------------------------------------===//
	/// New-pass-manager entry point: fetch the required analyses for \p F, run
	/// dead-store elimination, and report which analyses remain valid.
	///
	/// \returns PreservedAnalyses::all() when nothing changed; otherwise a set
	///          preserving the CFG analyses, GlobalsAA and
	///          MemoryDependenceAnalysis.
	PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
	  AliasAnalysis &AliasInfo = AM.getResult<AAManager>(F);
	  DominatorTree &DomTree = AM.getResult<DominatorTreeAnalysis>(F);
	  MemoryDependenceResults &MemDep = AM.getResult<MemoryDependenceAnalysis>(F);
	  const TargetLibraryInfo &LibInfo = AM.getResult<TargetLibraryAnalysis>(F);

	  const bool Changed =
	      eliminateDeadStores(F, &AliasInfo, &MemDep, &DomTree, &LibInfo);
	  if (Changed) {
	    PreservedAnalyses Preserved;
	    Preserved.preserveSet<CFGAnalyses>();
	    Preserved.preserve<GlobalsAA>();
	    Preserved.preserve<MemoryDependenceAnalysis>();
	    return Preserved;
	  }

	  return PreservedAnalyses::all();
	}

	namespace {

	/// Legacy-pass-manager wrapper around the dead-store-elimination
	/// implementation shared with \c DSEPass.
	class DSELegacyPass : public FunctionPass {
	public:
	  static char ID; // Pass identification, replacement for typeid

	  DSELegacyPass() : FunctionPass(ID) {
	    initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
	  }

	  /// Run DSE on \p F unless the function is to be skipped.
	  /// \returns true if \p F was modified.
	  bool runOnFunction(Function &F) override {
	    if (skipFunction(F))
	      return false;

	    auto &DomTree = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
	    auto &AliasInfo = getAnalysis<AAResultsWrapperPass>().getAAResults();
	    auto &MemDep = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
	    auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();

	    DominatorTree *DT = &DomTree;
	    AliasAnalysis *AA = &AliasInfo;
	    MemoryDependenceResults *MD = &MemDep;
	    const TargetLibraryInfo *TLI = &LibInfo;
	    return eliminateDeadStores(F, AA, MD, DT, TLI);
	  }

	  /// Declare the analyses this pass requires and the ones it keeps valid.
	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    AU.setPreservesCFG();

	    // Inputs.
	    AU.addRequired<DominatorTreeWrapperPass>();
	    AU.addRequired<AAResultsWrapperPass>();
	    AU.addRequired<MemoryDependenceWrapperPass>();
	    AU.addRequired<TargetLibraryInfoWrapperPass>();

	    // Still valid afterwards.
	    AU.addPreserved<DominatorTreeWrapperPass>();
	    AU.addPreserved<GlobalsAAWrapperPass>();
	    AU.addPreserved<MemoryDependenceWrapperPass>();
	  }
	};

	} // end anonymous namespace

	// Out-of-class definition of the static ID member declared in DSELegacyPass.
	char DSELegacyPass::ID = 0;

	// Pass registration for DSELegacyPass: argument string "dse", description
	// "Dead Store Elimination", with the wrapper passes listed between BEGIN and
	// END declared as its dependencies.
	INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
	false)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
	INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
	false)

	/// Factory for the legacy dead store elimination pass. Returns a newly
	/// allocated DSELegacyPass; the caller receives the raw pointer.
	FunctionPass *llvm::createDeadStoreEliminationPass() {
	return new DSELegacyPass();
	}
	Index: head/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
	===================================================================
	--- head/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp (revision 328816)
	+++ head/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp (revision 328817)
	@@ -1,1308 +1,1310 @@
	//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Loop unrolling may create many similar GEPs for array accesses.
	// e.g., a 2-level loop
	//
	// float a[32][32]; // global variable
	//
	// for (int i = 0; i < 2; ++i) {
	// for (int j = 0; j < 2; ++j) {
	// ...
	// ... = a[x + i][y + j];
	// ...
	// }
	// }
	//
	// will probably be unrolled to:
	//
	// gep %a, 0, %x, %y; load
	// gep %a, 0, %x, %y + 1; load
	// gep %a, 0, %x + 1, %y; load
	// gep %a, 0, %x + 1, %y + 1; load
	//
	// LLVM's GVN does not use partial redundancy elimination yet, and is thus
	// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
	// significant slowdown in targets with limited addressing modes. For instance,
	// because the PTX target does not support the reg+reg addressing mode, the
	// NVPTX backend emits PTX code that literally computes the pointer address of
	// each GEP, wasting tons of registers. It emits the following PTX for the
	// first load and similar PTX for other loads.
	//
	// mov.u32 %r1, %x;
	// mov.u32 %r2, %y;
	// mul.wide.u32 %rl2, %r1, 128;
	// mov.u64 %rl3, a;
	// add.s64 %rl4, %rl3, %rl2;
	// mul.wide.u32 %rl5, %r2, 4;
	// add.s64 %rl6, %rl4, %rl5;
	// ld.global.f32 %f1, [%rl6];
	//
	// To reduce the register pressure, the optimization implemented in this file
	// merges the common part of a group of GEPs, so we can compute each pointer
	// address by adding a simple offset to the common part, saving many registers.
	//
	// It works by splitting each GEP into a variadic base and a constant offset.
	// The variadic base can be computed once and reused by multiple GEPs, and the
	// constant offsets can be nicely folded into the reg+immediate addressing mode
	// (supported by most targets) without using any extra register.
	//
	// For instance, we transform the four GEPs and four loads in the above example
	// into:
	//
	// base = gep a, 0, x, y
	// load base
	// load base + 1 * sizeof(float)
	// load base + 32 * sizeof(float)
	// load base + 33 * sizeof(float)
	//
	// Given the transformed IR, a backend that supports the reg+immediate
	// addressing mode can easily fold the pointer arithmetics into the loads. For
	// example, the NVPTX backend can easily fold the pointer arithmetics into the
	// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
	//
	// mov.u32 %r1, %tid.x;
	// mov.u32 %r2, %tid.y;
	// mul.wide.u32 %rl2, %r1, 128;
	// mov.u64 %rl3, a;
	// add.s64 %rl4, %rl3, %rl2;
	// mul.wide.u32 %rl5, %r2, 4;
	// add.s64 %rl6, %rl4, %rl5;
	// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
	// ld.global.f32 %f2, [%rl6+4]; // much better
	// ld.global.f32 %f3, [%rl6+128]; // much better
	// ld.global.f32 %f4, [%rl6+132]; // much better
	//
	// Another improvement enabled by the LowerGEP flag is to lower a GEP with
	// multiple indices to either multiple GEPs with a single index or arithmetic
	// operations (depending on whether the target uses alias analysis in codegen).
	// Such transformation can have following benefits:
	// (1) It can always extract constants in the indices of structure type.
	// (2) After such Lowering, there are more optimization opportunities such as
	// CSE, LICM and CGP.
	//
	// E.g. The following GEPs have multiple indices:
	// BB1:
	// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
	// load %p
	// ...
	// BB2:
	// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
	// load %p2
	// ...
	//
	// We can not do CSE to the common part related to index "i64 %i". Lowering
	// GEPs can achieve such goals.
	// If the target does not use alias analysis in codegen, this pass will
	// lower a GEP with multiple indices into arithmetic operations:
	// BB1:
	// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
	// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
	// %3 = add i64 %1, %2 ; CSE opportunity
	// %4 = mul i64 %j1, length_of_struct
	// %5 = add i64 %3, %4
	// %6 = add i64 %5, struct_field_3 ; Constant offset
	// %p = inttoptr i64 %6 to i32*
	// load %p
	// ...
	// BB2:
	// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
	// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
	// %9 = add i64 %7, %8 ; CSE opportunity
	// %10 = mul i64 %j2, length_of_struct
	// %11 = add i64 %9, %10
	// %12 = add i64 %11, struct_field_2 ; Constant offset
	// %p2 = inttoptr i64 %12 to i32*
	// load %p2
	// ...
	//
	// If the target uses alias analysis in codegen, this pass will lower a GEP
	// with multiple indices into multiple GEPs with a single index:
	// BB1:
	// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
	// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
	// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
	// %4 = mul i64 %j1, length_of_struct
	// %5 = getelementptr i8* %3, i64 %4
	// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
	// %p = bitcast i8* %6 to i32*
	// load %p
	// ...
	// BB2:
	// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
	// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
	// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
	// %10 = mul i64 %j2, length_of_struct
	// %11 = getelementptr i8* %9, i64 %10
	// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
	// %p2 = bitcast i8* %12 to i32*
	// load %p2
	// ...
	//
	// Lowering GEPs can also benefit other passes such as LICM and CGP.
	// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple
	// indices if one of the index is variant. If we lower such GEP into invariant
	// parts and variant parts, LICM can hoist/sink those invariant parts.
	// CGP (CodeGen Prepare) tries to sink address calculations that match the
	// target's addressing modes. A GEP with multiple indices may not match and will
	// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
	// them. So we end up with a better addressing mode.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DepthFirstIterator.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/MemoryBuiltins.h"
	#include "llvm/Analysis/ScalarEvolution.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include <cassert>
	#include <cstdint>
	#include <string>

	using namespace llvm;
	using namespace llvm::PatternMatch;

	// Hidden command-line flag "disable-separate-const-offset-from-gep"
	// (default: false). Per its description, setting it asks the pass not to
	// separate the constant offset from a GEP instruction.
	static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
	"disable-separate-const-offset-from-gep", cl::init(false),
	cl::desc("Do not separate the constant offset from a GEP instruction"),
	cl::Hidden);

	// Hidden command-line flag "reassociate-geps-verify-no-dead-code"
	// (default: false).
	//
	// Setting this flag may emit false positives when the input module already
	// contains dead instructions. Therefore, we set it only in unit tests that are
	// free of dead code.
	static cl::opt<bool>
	VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
	cl::desc("Verify this pass produces no dead code"),
	cl::Hidden);

	namespace {

	/// \brief A helper class for separating a constant offset from a GEP index.
	///
	/// In real programs, a GEP index may be more complicated than a simple addition
	/// of something and a constant integer which can be trivially splitted. For
	/// example, to split ((a << 3) \| 5) + b, we need to search deeper for the
	/// constant offset, so that we can separate the index to (a << 3) + b and 5.
	///
	/// Therefore, this class looks into the expression that computes a given GEP
	/// index, and tries to find a constant integer that can be hoisted to the
	/// outermost level of the expression as an addition. Not every constant in an
	/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
	/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
	/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
	class ConstantOffsetExtractor {
	public:
	/// Extracts a constant offset from the given GEP index. It returns the
	/// new index representing the remainder (equal to the original index minus
	/// the constant offset), or nullptr if we cannot extract a constant offset.
	/// \p Idx The given GEP index
	/// \p GEP The given GEP
	/// \p UserChainTail Outputs the tail of UserChain so that we can
	/// garbage-collect unused instructions in UserChain.
	static Value Extract(Value Idx, GetElementPtrInst *GEP,
	User &UserChainTail, const DominatorTree DT);

	/// Looks for a constant offset from the given GEP index without extracting
	/// it. It returns the numeric value of the extracted constant offset (0 if
	/// failed). The meaning of the arguments are the same as Extract.
	static int64_t Find(Value Idx, GetElementPtrInst GEP,
	const DominatorTree *DT);

	private:
	ConstantOffsetExtractor(Instruction InsertionPt, const DominatorTree DT)
	: IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
	}

	/// Searches the expression that computes V for a non-zero constant C s.t.
	/// V can be reassociated into the form V' + C. If the searching is
	/// successful, returns C and update UserChain as a def-use chain from C to V;
	/// otherwise, UserChain is empty.
	///
	/// \p V The given expression
	/// \p SignExtended Whether V will be sign-extended in the computation of the
	/// GEP index
	/// \p ZeroExtended Whether V will be zero-extended in the computation of the
	/// GEP index
	/// \p NonNegative Whether V is guaranteed to be non-negative. For example,
	/// an index of an inbounds GEP is guaranteed to be
	/// non-negative. Levaraging this, we can better split
	/// inbounds GEPs.
	APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);

	/// A helper function to look into both operands of a binary operator.
	APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
	bool ZeroExtended);

	/// After finding the constant offset C from the GEP index I, we build a new
	/// index I' s.t. I' + C = I. This function builds and returns the new
	/// index I' according to UserChain produced by function "find".
	///
	/// The building conceptually takes two steps:
	/// 1) iteratively distribute s/zext towards the leaves of the expression tree
	/// that computes I
	/// 2) reassociate the expression tree to the form I' + C.
	///
	/// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
	/// sext to a, b and 5 so that we have
	/// sext(a) + (sext(b) + 5).
	/// Then, we reassociate it to
	/// (sext(a) + sext(b)) + 5.
	/// Given this form, we know I' is sext(a) + sext(b).
	Value *rebuildWithoutConstOffset();

	/// After the first step of rebuilding the GEP index without the constant
	/// offset, distribute s/zext to the operands of all operators in UserChain.
	/// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
	/// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
	///
	/// The function also updates UserChain to point to new subexpressions after
	/// distributing s/zext. e.g., the old UserChain of the above example is
	/// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
	/// and the new UserChain is
	/// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
	/// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
	///
	/// \p ChainIndex The index to UserChain. ChainIndex is initially
	/// UserChain.size() - 1, and is decremented during
	/// the recursion.
	Value *distributeExtsAndCloneChain(unsigned ChainIndex);

	/// Reassociates the GEP index to the form I' + C and returns I'.
	Value *removeConstOffset(unsigned ChainIndex);

	/// A helper function to apply ExtInsts, a list of s/zext, to value V.
	/// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
	/// returns "sext i32 (zext i16 V to i32) to i64".
	Value applyExts(Value V);

	/// A helper function that returns whether we can trace into the operands
	/// of binary operator BO for a constant offset.
	///
	/// \p SignExtended Whether BO is surrounded by sext
	/// \p ZeroExtended Whether BO is surrounded by zext
	/// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
	/// array index.
	bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
	bool NonNegative);

	/// The path from the constant offset to the old GEP index. e.g., if the GEP
	/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
	/// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
	/// UserChain[2] will be the entire expression "a * b + (c + 5)".
	///
	/// This path helps to rebuild the new GEP index.
	SmallVector<User *, 8> UserChain;

	/// A data structure used in rebuildWithoutConstOffset. Contains all
	/// sext/zext instructions along UserChain.
	SmallVector<CastInst *, 16> ExtInsts;

	/// Insertion position of cloned instructions.
	Instruction *IP;

	const DataLayout &DL;
	const DominatorTree *DT;
	};

	/// \brief A pass that tries to split every GEP in the function into a variadic
	/// base and a constant offset. It is a FunctionPass because searching for the
	/// constant offset may inspect other basic blocks.
	class SeparateConstOffsetFromGEP : public FunctionPass {
	public:
	static char ID;

	SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
	bool LowerGEP = false)
	: FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
	initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<DominatorTreeWrapperPass>();
	AU.addRequired<ScalarEvolutionWrapperPass>();
	AU.addRequired<TargetTransformInfoWrapperPass>();
	AU.addRequired<LoopInfoWrapperPass>();
	AU.setPreservesCFG();
	AU.addRequired<TargetLibraryInfoWrapperPass>();
	}

	bool doInitialization(Module &M) override {
	DL = &M.getDataLayout();
	return false;
	}

	bool runOnFunction(Function &F) override;

	private:
	/// Tries to split the given GEP into a variadic base and a constant offset,
	/// and returns true if the splitting succeeds.
	bool splitGEP(GetElementPtrInst *GEP);

	/// Lower a GEP with multiple indices into multiple GEPs with a single index.
	/// Function splitGEP already split the original GEP into a variadic part and
	/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
	/// variadic part into a set of GEPs with a single index and applies
	/// AccumulativeByteOffset to it.
	/// \p Variadic The variadic part of the original GEP.
	/// \p AccumulativeByteOffset The constant offset.
	void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
	int64_t AccumulativeByteOffset);

	/// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
	/// Function splitGEP already split the original GEP into a variadic part and
	/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
	/// variadic part into a set of arithmetic operations and applies
	/// AccumulativeByteOffset to it.
	/// \p Variadic The variadic part of the original GEP.
	/// \p AccumulativeByteOffset The constant offset.
	void lowerToArithmetics(GetElementPtrInst *Variadic,
	int64_t AccumulativeByteOffset);

	/// Finds the constant offset within each index and accumulates them. If
	/// LowerGEP is true, it finds in indices of both sequential and structure
	/// types, otherwise it only finds in sequential indices. The output
	/// NeedsExtraction indicates whether we successfully find a non-zero constant
	/// offset.
	int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);

	/// Canonicalize array indices to pointer-size integers. This helps to
	/// simplify the logic of splitting a GEP. For example, if a + b is a
	/// pointer-size integer, we have
	/// gep base, a + b = gep (gep base, a), b
	/// However, this equality may not hold if the size of a + b is smaller than
	/// the pointer size, because LLVM conceptually sign-extends GEP indices to
	/// pointer size before computing the address
	/// (http://llvm.org/docs/LangRef.html#id181).
	///
	/// This canonicalization is very likely already done in clang and
	/// instcombine. Therefore, the program will probably remain the same.
	///
	/// Returns true if the module changes.
	///
	/// Verified in @i32_add in split-gep.ll
	bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);

	/// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
	/// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
	/// the constant offset. After extraction, it becomes desirable to reunion the
	/// distributed sexts. For example,
	///
	/// &a[sext(i +nsw (j +nsw 5)]
	/// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
	/// => constant extraction &a[sext(i) + sext(j)] + 5
	/// => reunion &a[sext(i +nsw j)] + 5
	bool reuniteExts(Function &F);

	/// A helper that reunites sexts in an instruction.
	bool reuniteExts(Instruction *I);

	/// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
	Instruction findClosestMatchingDominator(const SCEV Key,
	Instruction *Dominatee);
	/// Verify F is free of dead code.
	void verifyNoDeadCode(Function &F);

	bool hasMoreThanOneUseInLoop(Value v, Loop L);

	// Swap the index operand of two GEP.
	void swapGEPOperand(GetElementPtrInst First, GetElementPtrInst Second);

	// Check if it is safe to swap operand of two GEP.
	bool isLegalToSwapOperand(GetElementPtrInst First, GetElementPtrInst Second,
	Loop *CurLoop);

	const DataLayout *DL = nullptr;
	DominatorTree *DT = nullptr;
	ScalarEvolution *SE;
	const TargetMachine *TM;

	LoopInfo *LI;
	TargetLibraryInfo *TLI;

	/// Whether to lower a GEP with multiple indices into arithmetic operations or
	/// multiple GEPs with a single index.
	bool LowerGEP;

	DenseMap<const SCEV , SmallVector<Instruction , 2>> DominatingExprs;
	};

	} // end anonymous namespace

	// Out-of-class definition of the static ID member declared in
	// SeparateConstOffsetFromGEP.
	char SeparateConstOffsetFromGEP::ID = 0;

	// Pass registration: argument string "separate-const-offset-from-gep", with
	// the wrapper passes listed between BEGIN and END declared as dependencies.
	INITIALIZE_PASS_BEGIN(
	SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
	"Split GEPs to a variadic base and a constant offset for better CSE", false,
	false)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
	INITIALIZE_PASS_END(
	SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
	"Split GEPs to a variadic base and a constant offset for better CSE", false,
	false)

	/// Factory for the pass. Returns a newly allocated SeparateConstOffsetFromGEP
	/// constructed with the given \p TM and \p LowerGEP.
	FunctionPass *
	llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
	bool LowerGEP) {
	return new SeparateConstOffsetFromGEP(TM, LowerGEP);
	}

	bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
	bool ZeroExtended,
	BinaryOperator *BO,
	bool NonNegative) {
	// We only consider ADD, SUB and OR, because a non-zero constant found in
	// expressions composed of these operations can be easily hoisted as a
	// constant offset by reassociation.
	if (BO->getOpcode() != Instruction::Add &&
	BO->getOpcode() != Instruction::Sub &&
	BO->getOpcode() != Instruction::Or) {
	return false;
	}

	Value LHS = BO->getOperand(0), RHS = BO->getOperand(1);
	// Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
	// don't have common bits, (LHS \| RHS) is equivalent to (LHS + RHS).
	if (BO->getOpcode() == Instruction::Or &&
	!haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
	return false;

	// In addition, tracing into BO requires that its surrounding s/zext (if
	// any) is distributable to both operands.
	//
	// Suppose BO = A op B.
	// SignExtended \| ZeroExtended \| Distributable?
	// --------------+--------------+----------------------------------
	// 0 \| 0 \| true because no s/zext exists
	// 0 \| 1 \| zext(BO) == zext(A) op zext(B)
	// 1 \| 0 \| sext(BO) == sext(A) op sext(B)
	// 1 \| 1 \| zext(sext(BO)) ==
	// \| \| zext(sext(A)) op zext(sext(B))
	if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
	// If a + b >= 0 and (a >= 0 or b >= 0), then
	// sext(a + b) = sext(a) + sext(b)
	// even if the addition is not marked nsw.
	//
	// Leveraging this invarient, we can trace into an sext'ed inbound GEP
	// index if the constant offset is non-negative.
	//
	// Verified in @sext_add in split-gep.ll.
	if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
	if (!ConstLHS->isNegative())
	return true;
	}
	if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
	if (!ConstRHS->isNegative())
	return true;
	}
	}

	// sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
	// zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
	if (BO->getOpcode() == Instruction::Add \|\|
	BO->getOpcode() == Instruction::Sub) {
	if (SignExtended && !BO->hasNoSignedWrap())
	return false;
	if (ZeroExtended && !BO->hasNoUnsignedWrap())
	return false;
	}

	return true;
	}

	/// Searches the left operand of \p BO for a constant offset and, only if that
	/// yields zero, the right operand. A result coming from the right operand of
	/// a subtraction is negated.
	APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
	                                                   bool SignExtended,
	                                                   bool ZeroExtended) {
	  // Knowing that BO is non-negative tells us nothing about the sign of its
	  // operands, so both recursive searches run with NonNegative cleared.
	  APInt FromLHS = find(BO->getOperand(0), SignExtended, ZeroExtended,
	                       /* NonNegative */ false);
	  // A hit on the left ends the search. This can miss the chance to merge
	  // constants from both sides, e.g. (a + 4) + (b + 5) => (a + b) + 9, but
	  // -instcombine has probably folded such cases already, since this pass runs
	  // after the standard optimizations.
	  if (FromLHS != 0)
	    return FromLHS;

	  APInt FromRHS = find(BO->getOperand(1), SignExtended, ZeroExtended,
	                       /* NonNegative */ false);
	  // For "x - (y + C)" the constant contributes -C to the whole expression.
	  return BO->getOpcode() == Instruction::Sub ? -FromRHS : FromRHS;
	}

	/// Recursively searches the expression computing \p V for a non-zero constant
	/// that can be hoisted out as an addition. Returns that constant at the bit
	/// width of \p V (zero if none was found) and, on success, appends \p V to
	/// UserChain so the chain runs from the constant up to the outermost value.
	APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
	                                    bool ZeroExtended, bool NonNegative) {
	  // TODO(jingyue): We could trace into integer/pointer casts, such as
	  // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
	  // integers because it gives good enough results for our benchmarks.
	  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();

	  // We cannot do much with Values that are not a User, such as an Argument.
	  User *U = dyn_cast<User>(V);
	  if (U == nullptr) return APInt(BitWidth, 0);

	  APInt ConstantOffset(BitWidth, 0);
	  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
	    // Hooray, we found it!
	    ConstantOffset = CI->getValue();
	  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
	    // Trace into subexpressions for more hoisting opportunities.
	    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
	      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
	  } else if (isa<SExtInst>(V)) {
	    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
	                          ZeroExtended, NonNegative).sext(BitWidth);
	  } else if (isa<ZExtInst>(V)) {
	    // As an optimization, we can clear the SignExtended flag because
	    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
	    //
	    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
	    ConstantOffset =
	        find(U->getOperand(0), /* SignExtended */ false,
	             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
	  }

	  // If we found a non-zero constant offset, add it to the path for
	  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
	  // help this optimization.
	  if (ConstantOffset != 0)
	    UserChain.push_back(U);
	  return ConstantOffset;
	}

	/// Wraps \p V in the sext/zext casts recorded in ExtInsts and returns the
	/// result. Constants are folded through ConstantExpr::getCast; for any other
	/// value the recorded cast instruction is cloned and inserted before IP.
	Value *ConstantOffsetExtractor::applyExts(Value *V) {
	  Value *Current = V;
	  // ExtInsts is built in the use-def order. Therefore, we apply them to V
	  // in the reversed order.
	  for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
	    if (Constant *C = dyn_cast<Constant>(Current)) {
	      // If Current is a constant, apply s/zext using ConstantExpr::getCast.
	      // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
	      Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
	    } else {
	      Instruction *Ext = (*I)->clone();
	      Ext->setOperand(0, Current);
	      Ext->insertBefore(IP);
	      Current = Ext;
	    }
	  }
	  return Current;
	}

	/// Builds the GEP index with the constant offset taken out: first distributes
	/// the recorded s/zext over the chain, then compacts UserChain, then
	/// reassociates from the tail of the chain.
	Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
	  distributeExtsAndCloneChain(UserChain.size() - 1);

	  // Chain slots that used to hold s/zext are nullptr at this point; squeeze
	  // them out while keeping the remaining entries in order.
	  const unsigned OldSize = UserChain.size();
	  unsigned Kept = 0;
	  for (unsigned Idx = 0; Idx != OldSize; ++Idx) {
	    User *Entry = UserChain[Idx];
	    if (Entry == nullptr)
	      continue;
	    UserChain[Kept] = Entry;
	    ++Kept;
	  }
	  UserChain.resize(Kept);

	  return removeConstOffset(Kept - 1);
	}

	/// Walks UserChain from \p ChainIndex down to 0, pushing every sext/zext on
	/// the way into ExtInsts (and nulling its chain slot) and replacing every
	/// binary operator with a clone whose operands have the collected extensions
	/// applied. Returns the new value stored at UserChain[ChainIndex].
	Value *
	ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
	  User *U = UserChain[ChainIndex];
	  if (ChainIndex == 0) {
	    assert(isa<ConstantInt>(U));
	    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
	    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
	  }

	  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
	    assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
	           "We only traced into two types of CastInst: sext and zext");
	    ExtInsts.push_back(Cast);
	    UserChain[ChainIndex] = nullptr;
	    return distributeExtsAndCloneChain(ChainIndex - 1);
	  }

	  // Function find only trace into BinaryOperator and CastInst.
	  BinaryOperator *BO = cast<BinaryOperator>(U);
	  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
	  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
	  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
	  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);

	  BinaryOperator *NewBO = nullptr;
	  if (OpNo == 0) {
	    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
	                                   BO->getName(), IP);
	  } else {
	    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
	                                   BO->getName(), IP);
	  }
	  return UserChain[ChainIndex] = NewBO;
	}

	/// Rebuilds the expression at UserChain[ChainIndex] with the constant at the
	/// head of the chain dropped, recursing towards index 0. Returns the rebuilt
	/// value; new instructions are inserted before IP.
	Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
	  // Head of the chain: the constant itself. Removing it leaves a zero of the
	  // same type.
	  if (ChainIndex == 0) {
	    assert(isa<ConstantInt>(UserChain[ChainIndex]));
	    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
	  }

	  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
	  assert(BO->getNumUses() <= 1 &&
	         "distributeExtsAndCloneChain clones each BinaryOperator in "
	         "UserChain, so no one should be used more than "
	         "once");

	  // Which operand of BO is the previous link of the chain?
	  const unsigned OpNo =
	      (BO->getOperand(0) == UserChain[ChainIndex - 1]) ? 0 : 1;
	  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
	  Value *NextInChain = removeConstOffset(ChainIndex - 1);
	  Value *TheOther = BO->getOperand(1 - OpNo);

	  // When the rebuilt chain operand is zero, the whole sub-expression collapses
	  // to the other operand -- except for "0 - x", which must stay a subtraction.
	  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
	    const bool ZeroIsSubLHS =
	        BO->getOpcode() == Instruction::Sub && OpNo == 0;
	    if (CI->isZero() && !ZeroIsSubLHS)
	      return TheOther;
	  }

	  // An "or" is re-emitted as "add": it was only traced into because its
	  // operands share no bits, which may stop being true once the constant is
	  // gone. For example, with a and b + 5 bit-disjoint, extracting 5 from
	  //   a | (b + 5)
	  // must not produce (a | b) + 5, which is a different value, whereas
	  //   a | (b + 5) = a + (b + 5) = (a + b) + 5
	  // holds.
	  const BinaryOperator::BinaryOps NewOp =
	      (BO->getOpcode() == Instruction::Or) ? Instruction::Add
	                                           : BO->getOpcode();

	  // Keep the chain operand on the side it originally occupied.
	  Value *NewLHS = (OpNo == 0) ? NextInChain : TheOther;
	  Value *NewRHS = (OpNo == 0) ? TheOther : NextInChain;
	  BinaryOperator *NewBO = BinaryOperator::Create(NewOp, NewLHS, NewRHS, "", IP);
	  NewBO->takeName(BO);
	  return NewBO;
	}

	/// Extracts a constant offset from GEP index \p Idx of \p GEP.
	///
	/// Returns the rebuilt index without the constant, or nullptr when no
	/// non-zero constant was found. \p UserChainTail receives the last element of
	/// the extractor's UserChain on success and nullptr on failure.
	Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
	                                        User *&UserChainTail,
	                                        const DominatorTree *DT) {
	  ConstantOffsetExtractor Extractor(GEP, DT);
	  // Find a non-zero constant offset first.
	  APInt ConstantOffset =
	      Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
	                     GEP->isInBounds());
	  if (ConstantOffset == 0) {
	    UserChainTail = nullptr;
	    return nullptr;
	  }
	  // Separates the constant offset from the GEP index.
	  Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
	  UserChainTail = Extractor.UserChain.back();
	  return IdxWithoutConstOffset;
	}

	int64_t ConstantOffsetExtractor::Find(Value Idx, GetElementPtrInst GEP,
	const DominatorTree *DT) {
	// If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
	return ConstantOffsetExtractor(GEP, DT)
	.find(Idx, /* SignExtended / false, / ZeroExtended */ false,
	GEP->isInBounds())
	.getSExtValue();
	}

	/// Replaces every sequential (non-struct) index of \p GEP whose type differs
	/// from the pointer-sized integer type with a signed integer cast to that
	/// type, inserted before \p GEP. Returns true if any index was replaced.
	bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
	    GetElementPtrInst *GEP) {
	  bool Changed = false;
	  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
	  gep_type_iterator GTI = gep_type_begin(*GEP);
	  // Operand 0 is the base pointer; the indices start at operand 1.
	  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
	       I != E; ++I, ++GTI) {
	    // Skip struct member indices which must be i32.
	    if (GTI.isSequential()) {
	      if ((*I)->getType() != IntPtrTy) {
	        // Write through the operand iterator so the GEP itself is updated.
	        *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
	        Changed = true;
	      }
	    }
	  }
	  return Changed;
	}

	/// Sums, in bytes, the constant offsets hidden in the indices of \p GEP.
	///
	/// Sequential indices contribute the constant found in the index expression
	/// scaled by the alloc size of the indexed type. Struct indices contribute
	/// the field offset, but only when LowerGEP is set. \p NeedsExtraction is set
	/// to true iff at least one index contributed.
	int64_t
	SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
	                                                 bool &NeedsExtraction) {
	  NeedsExtraction = false;
	  int64_t TotalBytes = 0;
	  gep_type_iterator TypeIt = gep_type_begin(*GEP);
	  const unsigned NumOps = GEP->getNumOperands();
	  // Operand 0 is the base pointer; the indices start at operand 1.
	  for (unsigned OpIdx = 1; OpIdx != NumOps; ++OpIdx, ++TypeIt) {
	    Value *Index = GEP->getOperand(OpIdx);

	    if (TypeIt.isSequential()) {
	      // Probe this index for an extractable constant.
	      const int64_t IndexConst = ConstantOffsetExtractor::Find(Index, GEP, DT);
	      if (IndexConst == 0)
	        continue;
	      NeedsExtraction = true;
	      // Convert the element count into bytes; the remainder of the GEP is
	      // later offset by the accumulated total.
	      TotalBytes +=
	          IndexConst * DL->getTypeAllocSize(TypeIt.getIndexedType());
	      continue;
	    }

	    // Struct indices are only folded into the offset when lowering GEPs.
	    if (!LowerGEP)
	      continue;
	    StructType *StTy = TypeIt.getStructType();
	    const uint64_t Field = cast<ConstantInt>(Index)->getZExtValue();
	    // Field 0 always sits at offset 0, so it contributes nothing.
	    if (Field == 0)
	      continue;
	    NeedsExtraction = true;
	    TotalBytes += DL->getStructLayout(StTy)->getElementOffset(Field);
	  }
	  return TotalBytes;
	}

	/// Replaces \p Variadic with a chain of single-index i8 GEPs ("uglygep"): one
	/// per non-zero sequential index, scaled by its element size, followed by one
	/// for \p AccumulativeByteOffset when that is non-zero. \p Variadic is erased
	/// after all its uses are redirected to the new pointer.
	void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
	    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
	  IRBuilder<> Builder(Variadic);
	  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
	  Type *I8PtrTy =
	      Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());

	  Value *Base = Variadic->getOperand(0);
	  Loop *L = LI->getLoopFor(Variadic->getParent());
	  // Swapping is only considered when the base is loop invariant and has at
	  // most one use inside the loop.
	  bool CanSwap =
	      L && L->isLoopInvariant(Base) && !hasMoreThanOneUseInLoop(Base, L);

	  Value *ResultPtr = Base;
	  if (ResultPtr->getType() != I8PtrTy)
	    ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);

	  // First GEP emitted by the loop below, if any.
	  Value *FirstResult = nullptr;

	  // Emit one i8 GEP per sequential index. Struct indices get none because
	  // they are already part of the accumulated constant offset.
	  gep_type_iterator TypeIt = gep_type_begin(*Variadic);
	  const unsigned NumOps = Variadic->getNumOperands();
	  for (unsigned OpNo = 1; OpNo != NumOps; ++OpNo, ++TypeIt) {
	    if (!TypeIt.isSequential())
	      continue;

	    Value *Idx = Variadic->getOperand(OpNo);
	    // A constant zero index contributes nothing.
	    ConstantInt *ConstIdx = dyn_cast<ConstantInt>(Idx);
	    if (ConstIdx && ConstIdx->isZero())
	      continue;

	    APInt ElementSize(IntPtrTy->getIntegerBitWidth(),
	                      DL->getTypeAllocSize(TypeIt.getIndexedType()));
	    // Convert the index to bytes: shift for power-of-two sizes, else multiply.
	    if (ElementSize != 1) {
	      Idx = ElementSize.isPowerOf2()
	                ? Builder.CreateShl(
	                      Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()))
	                : Builder.CreateMul(Idx,
	                                    ConstantInt::get(IntPtrTy, ElementSize));
	    }

	    ResultPtr =
	        Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
	    if (!FirstResult)
	      FirstResult = ResultPtr;
	  }

	  // Append the GEP that carries the constant byte offset, if there is one.
	  if (AccumulativeByteOffset == 0) {
	    CanSwap = false;
	  } else {
	    Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
	    ResultPtr =
	        Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
	  }

	  // With a constant-index GEP and a loop-invariant base, exchange the first
	  // GEP's index with the constant one so LICM can hoist the constant GEP
	  // later.
	  auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
	  auto *SecondGEP = dyn_cast_or_null<GetElementPtrInst>(ResultPtr);
	  if (CanSwap && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
	    swapGEPOperand(FirstGEP, SecondGEP);

	  if (ResultPtr->getType() != Variadic->getType())
	    ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());

	  Variadic->replaceAllUsesWith(ResultPtr);
	  Variadic->eraseFromParent();
	}

	/// Replaces \p Variadic with integer arithmetic: ptrtoint of the base, one
	/// scaled add per non-zero sequential index, an add of
	/// \p AccumulativeByteOffset when that is non-zero, and a final inttoptr.
	/// \p Variadic is erased after all its uses are redirected.
	void
	SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
	                                               int64_t AccumulativeByteOffset) {
	  IRBuilder<> Builder(Variadic);
	  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());

	  Value *Address = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);

	  // Emit SHL/MUL + ADD per sequential index. Struct indices are skipped
	  // because they are already part of the accumulated constant offset.
	  gep_type_iterator TypeIt = gep_type_begin(*Variadic);
	  const unsigned NumOps = Variadic->getNumOperands();
	  for (unsigned OpNo = 1; OpNo != NumOps; ++OpNo, ++TypeIt) {
	    if (!TypeIt.isSequential())
	      continue;

	    Value *Idx = Variadic->getOperand(OpNo);
	    // A constant zero index contributes nothing.
	    ConstantInt *ConstIdx = dyn_cast<ConstantInt>(Idx);
	    if (ConstIdx && ConstIdx->isZero())
	      continue;

	    APInt ElementSize(IntPtrTy->getIntegerBitWidth(),
	                      DL->getTypeAllocSize(TypeIt.getIndexedType()));
	    // Convert the index to bytes: shift for power-of-two sizes, else multiply.
	    if (ElementSize != 1) {
	      Idx = ElementSize.isPowerOf2()
	                ? Builder.CreateShl(
	                      Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()))
	                : Builder.CreateMul(Idx,
	                                    ConstantInt::get(IntPtrTy, ElementSize));
	    }

	    Address = Builder.CreateAdd(Address, Idx);
	  }

	  // Fold in the constant byte offset last.
	  if (AccumulativeByteOffset != 0)
	    Address = Builder.CreateAdd(
	        Address, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));

	  Value *NewPtr = Builder.CreateIntToPtr(Address, Variadic->getType());
	  Variadic->replaceAllUsesWith(NewPtr);
	  Variadic->eraseFromParent();
	}

	/// Tries to split \p GEP into a variadic base GEP plus a constant offset.
	///
	/// Steps, in order:
	///  1. bail out on vector GEPs and on GEPs whose indices are all constant;
	///  2. canonicalize sequential indices to pointer size;
	///  3. compute the accumulated constant byte offset, and stop if there is
	///     nothing to extract;
	///  4. when LowerGEP is off, stop if the target does not support
	///     "base register + offset" addressing for this offset;
	///  5. strip the constant part out of each sequential index;
	///  6. either lower the GEP completely (LowerGEP) or emit a second GEP that
	///     re-adds the constant offset, then erase the original.
	///
	/// \returns true if any IR was changed (including by step 2 alone).
	bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
	  // Skip vector GEPs.
	  if (GEP->getType()->isVectorTy())
	    return false;

	  // The backend can already nicely handle the case where all indices are
	  // constant.
	  if (GEP->hasAllConstantIndices())
	    return false;

	  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);

	  bool NeedsExtraction;
	  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);

	  if (!NeedsExtraction)
	    return Changed;
	  // If LowerGEP is disabled, before really splitting the GEP, check whether the
	  // backend supports the addressing mode we are about to produce. If no, this
	  // splitting probably won't be beneficial.
	  // If LowerGEP is enabled, even the extracted constant offset can not match
	  // the addressing mode, we can still do optimizations to other lowered parts
	  // of variable indices. Therefore, we don't check for addressing modes in that
	  // case.
	  if (!LowerGEP) {
	    TargetTransformInfo &TTI =
	        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
	            *GEP->getParent()->getParent());
	    unsigned AddrSpace = GEP->getPointerAddressSpace();
	    if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
	                                   /*BaseGV=*/nullptr, AccumulativeByteOffset,
	                                   /*HasBaseReg=*/true, /*Scale=*/0,
	                                   AddrSpace)) {
	      return Changed;
	    }
	  }

	  // Remove the constant offset in each sequential index. The resultant GEP
	  // computes the variadic base.
	  // Notice that we don't remove struct field indices here. If LowerGEP is
	  // disabled, a structure index is not accumulated and we still use the old
	  // one. If LowerGEP is enabled, a structure index is accumulated in the
	  // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
	  // handle the constant offset and won't need a new structure index.
	  gep_type_iterator GTI = gep_type_begin(*GEP);
	  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
	    if (GTI.isSequential()) {
	      // Splits this GEP index into a variadic part and a constant offset, and
	      // uses the variadic part as the new index.
	      Value *OldIdx = GEP->getOperand(I);
	      User *UserChainTail;
	      Value *NewIdx =
	          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
	      if (NewIdx != nullptr) {
	        // Switches to the index with the constant offset removed.
	        GEP->setOperand(I, NewIdx);
	        // After switching to the new index, we can garbage-collect UserChain
	        // and the old index if they are not used.
	        RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
	        RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
	      }
	    }
	  }

	  // Clear the inbounds attribute because the new index may be off-bound.
	  // e.g.,
	  //
	  // b = add i64 a, 5
	  // addr = gep inbounds float, float* p, i64 b
	  //
	  // is transformed to:
	  //
	  // addr2 = gep float, float* p, i64 a ; inbounds removed
	  // addr = gep inbounds float, float* addr2, i64 5
	  //
	  // If a is -4, although the old index b is in bounds, the new index a is
	  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
	  // inbounds keyword is not present, the offsets are added to the base
	  // address with silently-wrapping two's complement arithmetic".
	  // Therefore, the final code will be semantically equivalent.
	  //
	  // TODO(jingyue): do some range analysis to keep as many inbounds as
	  // possible. GEPs with inbounds are more friendly to alias analysis.
	  bool GEPWasInBounds = GEP->isInBounds();
	  GEP->setIsInBounds(false);

	  // Lowers a GEP to either GEPs with a single index or arithmetic operations.
	  if (LowerGEP) {
	    // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
	    // arithmetic operations if the target uses alias analysis in codegen.
	    if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
	      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
	    else
	      lowerToArithmetics(GEP, AccumulativeByteOffset);
	    return true;
	  }

	  // No need to create another GEP if the accumulative byte offset is 0.
	  if (AccumulativeByteOffset == 0)
	    return true;

	  // Offsets the base with the accumulative byte offset.
	  //
	  // %gep ; the base
	  // ... %gep ...
	  //
	  // => add the offset
	  //
	  // %gep2 ; clone of %gep
	  // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
	  // %gep ; will be removed
	  // ... %gep ...
	  //
	  // => replace all uses of %gep with %new.gep and remove %gep
	  //
	  // %gep2 ; clone of %gep
	  // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
	  // ... %new.gep ...
	  //
	  // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
	  // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
	  // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
	  // type of %gep.
	  //
	  // %gep2 ; clone of %gep
	  // %0 = bitcast %gep2 to i8*
	  // %uglygep = gep %0, <offset>
	  // %new.gep = bitcast %uglygep to <type of %gep>
	  // ... %new.gep ...
	  Instruction *NewGEP = GEP->clone();
	  NewGEP->insertBefore(GEP);

	  // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
	  // unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
	  // used with unsigned integers later.
	  int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
	      DL->getTypeAllocSize(GEP->getResultElementType()));
	  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
	  if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
	    // Very likely. As long as %gep is naturally aligned, the byte offset we
	    // extracted should be a multiple of sizeof(*%gep).
	    int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
	    NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
	                                       ConstantInt::get(IntPtrTy, Index, true),
	                                       GEP->getName(), GEP);
	+     NewGEP->copyMetadata(*GEP);
	    // Inherit the inbounds attribute of the original GEP.
	    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
	  } else {
	    // Unlikely but possible. For example,
	    // #pragma pack(1)
	    // struct S {
	    // int a[3];
	    // int64 b[8];
	    // };
	    // #pragma pack()
	    //
	    // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
	    // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
	    // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
	    // sizeof(int64).
	    //
	    // Emit an uglygep in this case.
	    Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
	                                       GEP->getPointerAddressSpace());
	    NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
	    NewGEP = GetElementPtrInst::Create(
	        Type::getInt8Ty(GEP->getContext()), NewGEP,
	        ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
	        GEP);
	+     NewGEP->copyMetadata(*GEP);
	    // Inherit the inbounds attribute of the original GEP.
	    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
	    if (GEP->getType() != I8PtrTy)
	      NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
	  }

	  GEP->replaceAllUsesWith(NewGEP);
	  GEP->eraseFromParent();

	  return true;
	}

	/// Pass entry point. Fetches the dominator tree, scalar evolution, loop info
	/// and target library info, runs splitGEP on every GEP instruction in \p F,
	/// then runs reuniteExts over the function.
	///
	/// \returns true if \p F was modified; false immediately when the function is
	///          skipped or the pass is disabled.
	bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
	  if (skipFunction(F))
	    return false;

	  if (DisableSeparateConstOffsetFromGEP)
	    return false;

	  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
	  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
	  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
	  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
	  bool Changed = false;
	  for (BasicBlock &B : F) {
	    // The iterator is advanced (I++) before splitGEP runs because splitGEP
	    // may erase the GEP it is given.
	    for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
	      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
	        Changed |= splitGEP(GEP);
	    // No need to split GEP ConstantExprs because all its indices are constant
	    // already.
	  }

	  Changed |= reuniteExts(F);

	  if (VerifyNoDeadCode)
	    verifyNoDeadCode(F);

	  return Changed;
	}

	/// Looks up the most recently recorded instruction under \p Key in
	/// DominatingExprs that dominates \p Dominatee.
	///
	/// Candidates that do not dominate \p Dominatee are popped from the stack as
	/// a side effect.
	///
	/// \returns the dominating candidate, or nullptr if \p Key is unknown or no
	///          remaining candidate dominates \p Dominatee.
	Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
	    const SCEV *Key, Instruction *Dominatee) {
	  auto Pos = DominatingExprs.find(Key);
	  if (Pos == DominatingExprs.end())
	    return nullptr;

	  auto &Candidates = Pos->second;
	  // Because we process the basic blocks in pre-order of the dominator tree, a
	  // candidate that doesn't dominate the current instruction won't dominate any
	  // future instruction either. Therefore, we pop it out of the stack. This
	  // optimization makes the algorithm O(n).
	  while (!Candidates.empty()) {
	    Instruction *Candidate = Candidates.back();
	    if (DT->dominates(Candidate, Dominatee))
	      return Candidate;
	    Candidates.pop_back();
	  }
	  return nullptr;
	}

	/// Tries to rewrite `sext(LHS) op sext(RHS)` (op is add or sub) into
	/// `sext(Dom)`, where Dom is a dominating `LHS op RHS` that cannot sign
	/// overflow. When \p I is itself such a no-signed-wrap add/sub, it is
	/// recorded as a future candidate instead.
	///
	/// Additions and subtractions are stored under different SCEV keys
	/// (LHS + RHS versus LHS - RHS), so a dominating add can never be reused for
	/// a sub of the same operands, or vice versa.
	///
	/// \returns true if \p I was replaced.
	bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
	  if (!SE->isSCEVable(I->getType()))
	    return false;

	  // Builds the lookup key for "L + R" or "L - R".
	  auto MakeKey = [this](Value *L, Value *R, bool IsSub) -> const SCEV * {
	    const SCEV *LS = SE->getUnknown(L);
	    const SCEV *RS = SE->getUnknown(R);
	    return IsSub ? SE->getMinusSCEV(LS, RS) : SE->getAddExpr(LS, RS);
	  };

	  // Dom: LHS op RHS
	  // I: sext(LHS) op sext(RHS)
	  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
	  // TODO: handle zext
	  Value *LHS = nullptr, *RHS = nullptr;
	  const bool IsSExtAdd =
	      match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))));
	  const bool IsSExtSub =
	      !IsSExtAdd &&
	      match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))));
	  if (IsSExtAdd || IsSExtSub) {
	    if (LHS->getType() == RHS->getType()) {
	      const SCEV *Key = MakeKey(LHS, RHS, IsSExtSub);
	      if (auto *Dom = findClosestMatchingDominator(Key, I)) {
	        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
	        NewSExt->takeName(I);
	        I->replaceAllUsesWith(NewSExt);
	        RecursivelyDeleteTriviallyDeadInstructions(I);
	        return true;
	      }
	    }
	  }

	  // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
	  const bool IsNSWAdd = match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)));
	  const bool IsNSWSub =
	      !IsNSWAdd && match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)));
	  if (IsNSWAdd || IsNSWSub) {
	    if (programUndefinedIfFullPoison(I)) {
	      const SCEV *Key = MakeKey(LHS, RHS, IsNSWSub);
	      DominatingExprs[Key].push_back(I);
	    }
	  }
	  return false;
	}

	/// Runs the per-instruction reuniteExts over every instruction of \p F,
	/// visiting basic blocks in depth-first order of the dominator tree.
	/// DominatingExprs is cleared first.
	///
	/// \returns true if any instruction was replaced.
	bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
	  bool Changed = false;
	  DominatingExprs.clear();
	  for (const auto Node : depth_first(DT)) {
	    BasicBlock *BB = Node->getBlock();
	    for (auto I = BB->begin(); I != BB->end(); ) {
	      // Advance before the call: reuniteExts may delete the instruction.
	      Instruction *Cur = &*I++;
	      Changed |= reuniteExts(Cur);
	    }
	  }
	  return Changed;
	}

	/// Debug check: reports the first trivially dead instruction found in \p F
	/// through llvm_unreachable, with the instruction printed in the message.
	void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
	  for (BasicBlock &Block : F) {
	    for (Instruction &Inst : Block) {
	      if (!isInstructionTriviallyDead(&Inst))
	        continue;

	      // Render the offending instruction into the diagnostic text.
	      std::string Message;
	      raw_string_ostream Stream(Message);
	      Stream << "Dead instruction detected!\n" << Inst << "\n";
	      llvm_unreachable(Stream.str().c_str());
	    }
	  }
	}

	bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
	GetElementPtrInst FirstGEP, GetElementPtrInst SecondGEP, Loop *CurLoop) {
	if (!FirstGEP \|\| !FirstGEP->hasOneUse())
	return false;

	if (!SecondGEP \|\| FirstGEP->getParent() != SecondGEP->getParent())
	return false;

	if (FirstGEP == SecondGEP)
	return false;

	unsigned FirstNum = FirstGEP->getNumOperands();
	unsigned SecondNum = SecondGEP->getNumOperands();
	// Give up if the number of operands are not 2.
	if (FirstNum != SecondNum \|\| FirstNum != 2)
	return false;

	Value *FirstBase = FirstGEP->getOperand(0);
	Value *SecondBase = SecondGEP->getOperand(0);
	Value *FirstOffset = FirstGEP->getOperand(1);
	// Give up if the index of the first GEP is loop invariant.
	if (CurLoop->isLoopInvariant(FirstOffset))
	return false;

	// Give up if base doesn't have same type.
	if (FirstBase->getType() != SecondBase->getType())
	return false;

	Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);

	// Check if the second operand of first GEP has constant coefficient.
	// For an example, for the following code, we won't gain anything by
	// hoisting the second GEP out because the second GEP can be folded away.
	// %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
	// %67 = shl i64 %scevgep.sum.ur159, 2
	// %uglygep160 = getelementptr i8* %65, i64 %67
	// %uglygep161 = getelementptr i8* %uglygep160, i64 -1024

	// Skip constant shift instruction which may be generated by Splitting GEPs.
	if (FirstOffsetDef && FirstOffsetDef->isShift() &&
	isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
	FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));

	// Give up if FirstOffsetDef is an Add or Sub with constant.
	// Because it may not profitable at all due to constant folding.
	if (FirstOffsetDef)
	if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
	unsigned opc = BO->getOpcode();
	if ((opc == Instruction::Add \|\| opc == Instruction::Sub) &&
	(isa<ConstantInt>(BO->getOperand(0)) \|\|
	isa<ConstantInt>(BO->getOperand(1))))
	return false;
	}
	return true;
	}

	bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value V, Loop L) {
	int UsesInLoop = 0;
	for (User *U : V->users()) {
	if (Instruction *User = dyn_cast<Instruction>(U))
	if (L->contains(User))
	if (++UsesInLoop > 1)
	return true;
	}
	return false;
	}

	/// Exchanges the index operands (operand 1) of \p First and \p Second, then
	/// recomputes their inbounds flags.
	///
	/// After the swap, the constant offset reachable from \p First is accumulated.
	/// If the underlying object's size is unknown or the offset exceeds it,
	/// inbounds is cleared on both GEPs; otherwise \p First is marked inbounds.
	void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
	                                                GetElementPtrInst *Second) {
	  Value *Offset1 = First->getOperand(1);
	  Value *Offset2 = Second->getOperand(1);
	  First->setOperand(1, Offset2);
	  Second->setOperand(1, Offset1);

	  // We changed p+o+c to p+c+o, p+c may not be inbound anymore.
	  const DataLayout &DAL = First->getModule()->getDataLayout();
	  APInt Offset(DAL.getPointerSizeInBits(
	                   cast<PointerType>(First->getType())->getAddressSpace()),
	               0);
	  Value *NewBase =
	      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
	  uint64_t ObjectSize;
	  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
	      Offset.ugt(ObjectSize)) {
	    First->setIsInBounds(false);
	    Second->setIsInBounds(false);
	  } else
	    First->setIsInBounds(true);
	}
	Index: head/contrib/llvm/tools/clang/include/clang/Driver/Options.td
	===================================================================
	--- head/contrib/llvm/tools/clang/include/clang/Driver/Options.td (revision 328816)
	+++ head/contrib/llvm/tools/clang/include/clang/Driver/Options.td (revision 328817)
	@@ -1,2791 +1,2795 @@
	//===--- Options.td - Options for clang -----------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the options accepted by clang.
	//
	//===----------------------------------------------------------------------===//

	// Include the common option parsing interfaces.
	include "llvm/Option/OptParser.td"

	/////////
	// Flags

	// Each def below declares one OptionFlag record. Options and groups later in
	// this file attach these records through Flags<[...]>.
	//
	// DriverOption - The option is a "driver" option, and should not be forwarded
	// to other tools.
	def DriverOption : OptionFlag;

	// LinkerInput - The option is a linker input.
	def LinkerInput : OptionFlag;

	// NoArgumentUnused - Don't report argument unused warnings for this option; this
	// is useful for options like -static or -dynamic which a user may always end up
	// passing, even if the platform defaults to (or only supports) that option.
	def NoArgumentUnused : OptionFlag;

	// Unsupported - The option is unsupported, and the driver will reject command
	// lines that use it.
	def Unsupported : OptionFlag;

	// Ignored - The option is unsupported, and the driver will silently ignore it.
	def Ignored : OptionFlag;

	// CoreOption - This is considered a "core" Clang option, available in both
	// clang and clang-cl modes.
	def CoreOption : OptionFlag;

	// CLOption - This is a cl.exe compatibility option. Options with this flag
	// are made available when the driver is running in CL compatibility mode.
	def CLOption : OptionFlag;

	// CC1Option - This option should be accepted by clang -cc1.
	def CC1Option : OptionFlag;

	// CC1AsOption - This option should be accepted by clang -cc1as.
	def CC1AsOption : OptionFlag;

	// NoDriverOption - This option should not be accepted by the driver.
	def NoDriverOption : OptionFlag;

	// Mixin classes used by the group and option records below. The Doc* classes
	// each add one documentation-related field to the record that inherits them.
	//
	// A short name to show in documentation. The name will be interpreted as rST.
	class DocName<string name> { string DocName = name; }

	// A brief description to show in documentation, interpreted as rST.
	class DocBrief<code descr> { code DocBrief = descr; }

	// Indicates that this group should be flattened into its parent when generating
	// documentation.
	class DocFlatten { bit DocFlatten = 1; }

	// Indicates that this warning is ignored, but accepted with a warning for
	// GCC compatibility.
	// The class itself only attaches the HelpHidden flag.
	// NOTE(review): "this warning" presumably means the option that inherits this
	// class — confirm.
	class IgnoredGCCCompat : Flags<[HelpHidden]> {}

	/////////
	// Groups

	// Group hierarchy: a record that lists Group<X> is a child of X; records with
	// no Group<> (such as Action_Group and CompileOnly_Group) are top-level.

	// Top-level group, documented as "Actions".
	def Action_Group : OptionGroup<"<action group>">, DocName<"Actions">,
	DocBrief<[{The action to perform on the input.}]>;

	// Meta-group for options which are only used for compilation,
	// and not linking etc.
	def CompileOnly_Group : OptionGroup<"<CompileOnly group>">,
	DocName<"Compilation flags">, DocBrief<[{
	Flags controlling the behavior of Clang during compilation. These flags have
	no effect during actions that do not perform compilation.}]>;

	// Child of CompileOnly_Group; parent of the include-path, dependency-file and
	// preprocessor-dump groups defined below.
	def Preprocessor_Group : OptionGroup<"<Preprocessor group>">,
	Group<CompileOnly_Group>,
	DocName<"Preprocessor flags">, DocBrief<[{
	Flags controlling the behavior of the Clang preprocessor.}]>;

	def IncludePath_Group : OptionGroup<"<I/i group>">, Group<Preprocessor_Group>,
	DocName<"Include path management">,
	DocBrief<[{
	Flags controlling how ``#include``\s are resolved to files.}]>;

	// I_Group and i_Group sit under IncludePath_Group, and clang_i_Group sits
	// under i_Group. All three are marked DocFlatten.
	def I_Group : OptionGroup<"<I group>">, Group<IncludePath_Group>, DocFlatten;
	def i_Group : OptionGroup<"<i group>">, Group<IncludePath_Group>, DocFlatten;
	def clang_i_Group : OptionGroup<"<clang i group>">, Group<i_Group>, DocFlatten;

	def M_Group : OptionGroup<"<M group>">, Group<Preprocessor_Group>,
	DocName<"Dependency file generation">, DocBrief<[{
	Flags controlling generation of a dependency file for ``make``-like build
	systems.}]>;

	def d_Group : OptionGroup<"<d group>">, Group<Preprocessor_Group>,
	DocName<"Dumping preprocessor state">, DocBrief<[{
	Flags allowing the state of the preprocessor to be dumped in various ways.}]>;

	// Diagnostic groups: Diag_Group is the documented parent; the R/W groups and
	// their "(with value)" children are all marked DocFlatten.
	def Diag_Group : OptionGroup<"<W/R group>">, Group<CompileOnly_Group>,
	DocName<"Diagnostic flags">, DocBrief<[{
	Flags controlling which warnings, errors, and remarks Clang will generate.
	See the :doc:`full list of warning and remark flags <DiagnosticsReference>`.}]>;

	def R_Group : OptionGroup<"<R group>">, Group<Diag_Group>, DocFlatten;
	def R_value_Group : OptionGroup<"<R (with value) group>">, Group<R_Group>,
	DocFlatten;
	def W_Group : OptionGroup<"<W group>">, Group<Diag_Group>, DocFlatten;
	def W_value_Group : OptionGroup<"<W (with value) group>">, Group<W_Group>,
	DocFlatten;

	// Documented parent group for the -f style options.
	def f_Group : OptionGroup<"<f group>">, Group<CompileOnly_Group>,
	DocName<"Target-independent compilation options">;

	// f_clang_Group hangs directly off CompileOnly_Group (not f_Group) and is
	// flattened in the documentation. pedantic_Group and opencl_Group are
	// children of f_Group.
	def f_clang_Group : OptionGroup<"<f (clang-only) group>">,
	Group<CompileOnly_Group>, DocFlatten;
	def pedantic_Group : OptionGroup<"<pedantic group>">, Group<f_Group>,
	DocFlatten;
	def opencl_Group : OptionGroup<"<opencl group>">, Group<f_Group>,
	DocName<"OpenCL flags">;

	// Documented parent group for the -m style options.
	def m_Group : OptionGroup<"<m group>">, Group<CompileOnly_Group>,
	DocName<"Target-dependent compilation options">;

	// Feature groups - these take command line options that correspond directly to
	// target specific features and can be translated directly from command line
	// options.
	def m_aarch64_Features_Group : OptionGroup<"<aarch64 features group>">,
	Group<m_Group>, DocName<"AARCH64">;
	def m_amdgpu_Features_Group : OptionGroup<"<amdgpu features group>">,
	Group<m_Group>, DocName<"AMDGPU">;
	def m_arm_Features_Group : OptionGroup<"<arm features group>">,
	Group<m_Group>, DocName<"ARM">;
	def m_hexagon_Features_Group : OptionGroup<"<hexagon features group>">,
	Group<m_Group>, DocName<"Hexagon">;
	// The features added by this group will not be added to target features.
	// These are explicitly handled.
	// NOTE(review): this record reuses the "<hexagon features group>" name string
	// and the "Hexagon" DocName of m_hexagon_Features_Group — confirm the
	// duplication is intended.
	def m_hexagon_Features_HVX_Group : OptionGroup<"<hexagon features group>">,
	Group<m_Group>, DocName<"Hexagon">;
	def m_ppc_Features_Group : OptionGroup<"<ppc features group>">,
	Group<m_Group>, DocName<"PowerPC">;
	def m_wasm_Features_Group : OptionGroup<"<wasm features group>">,
	Group<m_Group>, DocName<"WebAssembly">;
	// The only feature group in this list that also carries the CoreOption flag.
	def m_x86_Features_Group : OptionGroup<"<x86 features group>">,
	Group<m_Group>, Flags<[CoreOption]>, DocName<"X86">;

	// Child of m_Group that is hidden from help output (HelpHidden).
	def m_libc_Group : OptionGroup<"<m libc group>">, Group<m_Group>,
	Flags<[HelpHidden]>;

	// Optimization-level options; child of CompileOnly_Group.
	def O_Group : OptionGroup<"<O group>">, Group<CompileOnly_Group>,
	DocName<"Optimization level">, DocBrief<[{
	Flags controlling how much optimization should be performed.}]>;

	// Parent of all debug-information groups below.
	// NOTE(review): DebugInfo_Group and g_Group both use the name string
	// "<g group>" — confirm the duplication is intended.
	def DebugInfo_Group : OptionGroup<"<g group>">, Group<CompileOnly_Group>,
	DocName<"Debug information generation">, DocBrief<[{
	Flags controlling how much and what kind of debug information should be
	generated.}]>;

	// Nesting: DebugInfo_Group contains g_Group and g_flags_Group; g_Group
	// contains gN_Group and gTune_Group; gN_Group contains ggdbN_Group.
	def g_Group : OptionGroup<"<g group>">, Group<DebugInfo_Group>,
	DocName<"Kind and level of debug information">;
	def gN_Group : OptionGroup<"<gN group>">, Group<g_Group>,
	DocName<"Debug level">;
	def ggdbN_Group : OptionGroup<"<ggdbN group>">, Group<gN_Group>, DocFlatten;
	def gTune_Group : OptionGroup<"<gTune group>">, Group<g_Group>,
	DocName<"Debugger to tune debug information for">;
	def g_flags_Group : OptionGroup<"<g flags group>">, Group<DebugInfo_Group>,
	DocName<"Debug information flags">;

	// Top-level group (no parent Group<>).
	def StaticAnalyzer_Group : OptionGroup<"<Static analyzer group>">,
	DocName<"Static analyzer flags">, DocBrief<[{
	Flags controlling the behavior of the Clang Static Analyzer.}]>;

	// gfortran options that we recognize in the driver and pass along when
	// invoking GCC to compile Fortran code.
	def gfortran_Group : OptionGroup<"<gfortran group>">,
	DocName<"Fortran compilation flags">, DocBrief<[{
	Flags that will be passed onto the ``gfortran`` compiler when Clang is given
	a Fortran input.}]>;

	// Linker-related groups; T_Group and u_Group are flattened into Link_Group
	// in the documentation.
	def Link_Group : OptionGroup<"<T/e/s/t/u group>">, DocName<"Linker flags">,
	DocBrief<[{Flags that are passed on to the linker}]>;
	def T_Group : OptionGroup<"<T group>">, Group<Link_Group>, DocFlatten;
	def u_Group : OptionGroup<"<u group>">, Group<Link_Group>, DocFlatten;

	// Every option placed in this group inherits the Unsupported flag.
	def reserved_lib_Group : OptionGroup<"<reserved libs group>">,
	Flags<[Unsupported]>;

	// Temporary groups for clang options which we know we don't support,
	// but don't want to verbosely warn the user about.
	def clang_ignored_f_Group : OptionGroup<"<clang ignored f group>">,
	Group<f_Group>, Flags<[Ignored]>;
	def clang_ignored_m_Group : OptionGroup<"<clang ignored m group>">,
	Group<m_Group>, Flags<[Ignored]>;

	// Group for clang options in the process of deprecation.
	// Please include the version that deprecated the flag as comment to allow
	// easier garbage collection.
	def clang_ignored_legacy_options_Group : OptionGroup<"<clang legacy flags>">,
	Group<f_Group>, Flags<[Ignored]>;

	// Retired with clang-5.0
	// Anonymous records: the flags are still recognised but fall into the
	// Ignored legacy group above.
	def : Flag<["-"], "fslp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;
	def : Flag<["-"], "fno-slp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;

	// Group that ignores all gcc optimizations that won't be implemented
	def clang_ignored_gcc_optimization_f_Group : OptionGroup<
	"<clang_ignored_gcc_optimization_f_Group>">, Group<f_Group>, Flags<[Ignored]>;

	/////////
	// Options

	// The internal option ID must be a valid C++ identifier and results in a
	// clang::driver::options::OPT_XX enum constant for XX.
	//
	// We want to unambiguously be able to refer to options from the driver source
	// code, for this reason the option name is mangled into an ID. This mangling
	// isn't guaranteed to have an inverse, but for practical purposes it does.
	//
	// The mangling scheme is to ignore the leading '-', and perform the following
	// substitutions:
	// _ => __
	// - => _
	// / => _SLASH
	// # => _HASH
	// ? => _QUESTION
	// , => _COMMA
	// = => _EQ
	// C++ => CXX
	// . => _

	// Developer Driver Options

	// Internal option groups. internal_Group is hidden from help output and is
	// the parent of the driver and debug/development groups.
	def internal_Group : OptionGroup<"<clang internal options>">, Flags<[HelpHidden]>;
	def internal_driver_Group : OptionGroup<"<clang driver internal options>">,
	Group<internal_Group>, HelpText<"DRIVER OPTIONS">;
	def internal_debug_Group :
	OptionGroup<"<clang debug/development internal options>">,
	Group<internal_Group>, HelpText<"DEBUG/DEVELOPMENT OPTIONS">;

	// Shorthand mixin: puts an option in internal_driver_Group and marks it
	// DriverOption + HelpHidden.
	class InternalDriverOpt : Group<internal_driver_Group>,
	Flags<[DriverOption, HelpHidden]>;
	// driver_mode and rsp_quoting spell out their group and flags instead of
	// using InternalDriverOpt because they additionally carry CoreOption.
	def driver_mode : Joined<["--"], "driver-mode=">, Group<internal_driver_Group>,
	Flags<[CoreOption, DriverOption, HelpHidden]>,
	HelpText<"Set the driver mode to either 'gcc', 'g++', 'cpp', or 'cl'">;
	def rsp_quoting : Joined<["--"], "rsp-quoting=">, Group<internal_driver_Group>,
	Flags<[CoreOption, DriverOption, HelpHidden]>,
	HelpText<"Set the rsp quoting to either 'posix', or 'windows'">;
	def ccc_gcc_name : Separate<["-"], "ccc-gcc-name">, InternalDriverOpt,
	HelpText<"Name for native GCC compiler">,
	MetaVarName<"<gcc-path>">;
	def ccc_pch_is_pch : Flag<["-"], "ccc-pch-is-pch">, InternalDriverOpt,
	HelpText<"Use lazy PCH for precompiled headers">;
	def ccc_pch_is_pth : Flag<["-"], "ccc-pch-is-pth">, InternalDriverOpt,
	HelpText<"Use pretokenized headers for precompiled headers">;

	// Shorthand mixin: puts an option in internal_debug_Group and marks it
	// DriverOption + HelpHidden + CoreOption.
	class InternalDebugOpt : Group<internal_debug_Group>,
	Flags<[DriverOption, HelpHidden, CoreOption]>;
	def ccc_install_dir : Separate<["-"], "ccc-install-dir">, InternalDebugOpt,
	HelpText<"Simulate installation in the given directory">;
	def ccc_print_phases : Flag<["-"], "ccc-print-phases">, InternalDebugOpt,
	HelpText<"Dump list of actions to perform">;
	def ccc_print_bindings : Flag<["-"], "ccc-print-bindings">, InternalDebugOpt,
	HelpText<"Show bindings of tools to actions">;

	// ARC migration options. The ccc-arcmt-* ones are internal driver options;
	// the two arcmt-migrate-* ones below carry CC1Option and no group.
	def ccc_arcmt_check : Flag<["-"], "ccc-arcmt-check">, InternalDriverOpt,
	HelpText<"Check for ARC migration issues that need manual handling">;
	def ccc_arcmt_modify : Flag<["-"], "ccc-arcmt-modify">, InternalDriverOpt,
	HelpText<"Apply modifications to files to conform to ARC">;
	def ccc_arcmt_migrate : Separate<["-"], "ccc-arcmt-migrate">, InternalDriverOpt,
	HelpText<"Apply modifications and produces temporary files that conform to ARC">;
	def arcmt_migrate_report_output : Separate<["-"], "arcmt-migrate-report-output">,
	HelpText<"Output path for the plist report">, Flags<[CC1Option]>;
	// Note: the record name ends in "emit_arc_errors" while the flag spelling is
	// "arcmt-migrate-emit-errors".
	def arcmt_migrate_emit_arc_errors : Flag<["-"], "arcmt-migrate-emit-errors">,
	HelpText<"Emit ARC errors even if the migrator can fix them">,
	Flags<[CC1Option]>;
	def gen_reproducer: Flag<["-"], "gen-reproducer">, InternalDebugOpt,
	HelpText<"Auto-generates preprocessed source files and a reproduction script">;

	// Migrator entry point (--migrate) and the Objective-C modernization
	// ("objcmt") options. Each -objcmt-* flag below carries CC1Option and
	// enables one specific migration, as described by its help text.
	def _migrate : Flag<["--"], "migrate">, Flags<[DriverOption]>,
	HelpText<"Run the migrator">;
	def ccc_objcmt_migrate : Separate<["-"], "ccc-objcmt-migrate">,
	InternalDriverOpt,
	HelpText<"Apply modifications and produces temporary files to migrate to "
	"modern ObjC syntax">;
	def objcmt_migrate_literals : Flag<["-"], "objcmt-migrate-literals">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC literals">;
	def objcmt_migrate_subscripting : Flag<["-"], "objcmt-migrate-subscripting">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC subscripting">;
	def objcmt_migrate_property : Flag<["-"], "objcmt-migrate-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC property">;
	def objcmt_migrate_all : Flag<["-"], "objcmt-migrate-all">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC">;
	def objcmt_migrate_readonly_property : Flag<["-"], "objcmt-migrate-readonly-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC readonly property">;
	def objcmt_migrate_readwrite_property : Flag<["-"], "objcmt-migrate-readwrite-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC readwrite property">;
	def objcmt_migrate_property_dot_syntax : Flag<["-"], "objcmt-migrate-property-dot-syntax">, Flags<[CC1Option]>,
	HelpText<"Enable migration of setter/getter messages to property-dot syntax">;
	def objcmt_migrate_annotation : Flag<["-"], "objcmt-migrate-annotation">, Flags<[CC1Option]>,
	HelpText<"Enable migration to property and method annotations">;
	def objcmt_migrate_instancetype : Flag<["-"], "objcmt-migrate-instancetype">, Flags<[CC1Option]>,
	HelpText<"Enable migration to infer instancetype for method result type">;
	// Record name is "nsmacros"; the spelling is "-objcmt-migrate-ns-macros".
	def objcmt_migrate_nsmacros : Flag<["-"], "objcmt-migrate-ns-macros">, Flags<[CC1Option]>,
	HelpText<"Enable migration to NS_ENUM/NS_OPTIONS macros">;
	def objcmt_migrate_protocol_conformance : Flag<["-"], "objcmt-migrate-protocol-conformance">, Flags<[CC1Option]>,
	HelpText<"Enable migration to add protocol conformance on classes">;
	def objcmt_atomic_property : Flag<["-"], "objcmt-atomic-property">, Flags<[CC1Option]>,
	HelpText<"Make migration to 'atomic' properties">;
	def objcmt_returns_innerpointer_property : Flag<["-"], "objcmt-returns-innerpointer-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to annotate property with NS_RETURNS_INNER_POINTER">;
	def objcmt_ns_nonatomic_iosonly: Flag<["-"], "objcmt-ns-nonatomic-iosonly">, Flags<[CC1Option]>,
	HelpText<"Enable migration to use NS_NONATOMIC_IOSONLY macro for setting property's 'atomic' attribute">;
	def objcmt_migrate_designated_init : Flag<["-"], "objcmt-migrate-designated-init">, Flags<[CC1Option]>,
	HelpText<"Enable migration to infer NS_DESIGNATED_INITIALIZER for initializer methods">;
	// Joined option: the directory path follows the '=' directly.
	def objcmt_whitelist_dir_path: Joined<["-"], "objcmt-whitelist-dir-path=">, Flags<[CC1Option]>,
	HelpText<"Only modify files with a filename contained in the provided directory path">;
	// The misspelt "white-list" [sic] alias is due for removal.
	// (Anonymous record: it exists only to alias objcmt_whitelist_dir_path.)
	def : Joined<["-"], "objcmt-white-list-dir-path=">, Flags<[CC1Option]>,
	Alias<objcmt_whitelist_dir_path>;

	// Catch-all for any remaining "-ccc-" spelling: a Joined option on the bare
	// prefix, flagged Unsupported so that such options are rejected.
	def ccc_
	    : Joined<["-"], "ccc-">,
	      Group<internal_Group>,
	      Flags<[Unsupported]>;

	// Standard Options

	// Short "standard" options, from -### and the bare "--" separator through
	// the single-letter options up to -L.
	def _HASH_HASH_HASH : Flag<["-"], "###">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Print (but do not run) the commands to run for this compilation">;
	// "--" with an empty name and KIND_REMAINING_ARGS.
	def _DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>,
	Flags<[DriverOption, CoreOption]>;
	def A : JoinedOrSeparate<["-"], "A">, Flags<[RenderJoined]>, Group<gfortran_Group>;
	def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"<dir>">,
	HelpText<"Add <dir> to search path for binaries and object files used implicitly">;
	def CC : Flag<["-"], "CC">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Include comments from within macros in preprocessed output">;
	def C : Flag<["-"], "C">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Include comments in preprocessed output">;
	def D : JoinedOrSeparate<["-"], "D">, Group<Preprocessor_Group>,
	Flags<[CC1Option]>, MetaVarName<"<macro>=<value>">,
	HelpText<"Define <macro> to <value> (or 1 if <value> omitted)">;
	def E : Flag<["-"], "E">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
	HelpText<"Only run the preprocessor">;
	def F : JoinedOrSeparate<["-"], "F">, Flags<[RenderJoined,CC1Option]>,
	HelpText<"Add directory to framework include search path">;
	def G : JoinedOrSeparate<["-"], "G">, Flags<[DriverOption]>, Group<m_Group>,
	MetaVarName<"<size>">, HelpText<"Put objects of at most <size> bytes "
	"into small data section (MIPS / Hexagon)">;
	// "-G=<size>" is an alias for -G.
	def G_EQ : Joined<["-"], "G=">, Flags<[DriverOption]>, Group<m_Group>, Alias<G>;
	def H : Flag<["-"], "H">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Show header includes and nesting depth">;
	// "-I-" is a distinct Flag record, separate from the -I<dir> option below.
	def I_ : Flag<["-"], "I-">, Group<I_Group>,
	HelpText<"Restrict all prior -I flags to double-quoted inclusion and "
	"remove current directory from include path">;
	def I : JoinedOrSeparate<["-"], "I">, Group<I_Group>,
	Flags<[CC1Option,CC1AsOption]>, MetaVarName<"<dir>">,
	HelpText<"Add directory to include search path">;
	def L : JoinedOrSeparate<["-"], "L">, Flags<[RenderJoined]>, Group<Link_Group>,
	MetaVarName<"<dir>">, HelpText<"Add directory to library search path">;
	// Dependency-file ("depfile") options; all belong to M_Group.
	// -MD/-MMD write a depfile; -M/-MM are the variants that also imply -E
	// and write to stdout by default (per their help strings).
	def MD : Flag<["-"], "MD">, Group<M_Group>,
	HelpText<"Write a depfile containing user and system headers">;
	def MMD : Flag<["-"], "MMD">, Group<M_Group>,
	HelpText<"Write a depfile containing user headers">;
	def M : Flag<["-"], "M">, Group<M_Group>,
	HelpText<"Like -MD, but also implies -E and writes to stdout by default">;
	def MM : Flag<["-"], "MM">, Group<M_Group>,
	HelpText<"Like -MMD, but also implies -E and writes to stdout by default">;
	// Options that shape the depfile's destination and contents.
	def MF : JoinedOrSeparate<["-"], "MF">, Group<M_Group>,
	HelpText<"Write depfile output from -MMD, -MD, -MM, or -M to <file>">,
	MetaVarName<"<file>">;
	def MG : Flag<["-"], "MG">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Add missing headers to depfile">;
	def MJ : JoinedOrSeparate<["-"], "MJ">, Group<M_Group>,
	HelpText<"Write a compilation database entry per input">;
	def MP : Flag<["-"], "MP">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Create phony target for each dependency (other than main file)">;
	def MQ : JoinedOrSeparate<["-"], "MQ">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Specify name of main file output to quote in depfile">;
	def MT : JoinedOrSeparate<["-"], "MT">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Specify name of main file output in depfile">;
	def MV : Flag<["-"], "MV">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Use NMake/Jom format for the depfile">;
	// -Mach, the optimization-level options (O_Group), the -ObjC/-ObjC++ input
	// language selectors, -P, and the -Q family.
	def Mach : Flag<["-"], "Mach">, Group<Link_Group>;
	def O0 : Flag<["-"], "O0">, Group<O_Group>, Flags<[CC1Option, HelpHidden]>;
	def O4 : Flag<["-"], "O4">, Group<O_Group>, Flags<[CC1Option, HelpHidden]>;
	def ObjCXX : Flag<["-"], "ObjC++">, Flags<[DriverOption]>,
	HelpText<"Treat source input files as Objective-C++ inputs">;
	def ObjC : Flag<["-"], "ObjC">, Flags<[DriverOption]>,
	HelpText<"Treat source input files as Objective-C inputs">;
	// -O<value> is the Joined form; a bare "-O" is an alias for it with the
	// argument "2".
	def O : Joined<["-"], "O">, Group<O_Group>, Flags<[CC1Option]>;
	def O_flag : Flag<["-"], "O">, Flags<[CC1Option]>, Alias<O>, AliasArgs<["2"]>;
	def Ofast : Joined<["-"], "Ofast">, Group<O_Group>, Flags<[CC1Option]>;
	def P : Flag<["-"], "P">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Disable linemarker output in -E mode">;
	// -Qn and -Q are tagged IgnoredGCCCompat.
	def Qn : Flag<["-"], "Qn">, IgnoredGCCCompat;
	def Qunused_arguments : Flag<["-"], "Qunused-arguments">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Don't emit warning for unused driver arguments">;
	def Q : Flag<["-"], "Q">, IgnoredGCCCompat;
	// Optimization-remark options. The three -Rpass*= forms take a POSIX regular
	// expression matched against pass names and live in R_value_Group; the
	// generic -R<remark> form lives in R_Group. -S follows.
	def Rpass_EQ : Joined<["-"], "Rpass=">, Group<R_value_Group>, Flags<[CC1Option]>,
	HelpText<"Report transformations performed by optimization passes whose "
	"name matches the given POSIX regular expression">;
	def Rpass_missed_EQ : Joined<["-"], "Rpass-missed=">, Group<R_value_Group>,
	Flags<[CC1Option]>,
	HelpText<"Report missed transformations by optimization passes whose "
	"name matches the given POSIX regular expression">;
	def Rpass_analysis_EQ : Joined<["-"], "Rpass-analysis=">, Group<R_value_Group>,
	Flags<[CC1Option]>,
	HelpText<"Report transformation analysis from optimization passes whose "
	"name matches the given POSIX regular expression">;
	def R_Joined : Joined<["-"], "R">, Group<R_Group>, Flags<[CC1Option, CoreOption]>,
	MetaVarName<"<remark>">, HelpText<"Enable the specified remark">;
	def S : Flag<["-"], "S">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
	HelpText<"Only run preprocess and compilation steps">;
	// Section start-address options (T_Group). Each accepts <addr> either joined
	// to the option or as a separate argument. The help text names the section
	// that matches the option spelling: -Tbss => BSS, -Tdata => DATA,
	// -Ttext => TEXT. (-Tdata and -Ttext previously repeated the -Tbss text.)
	def Tbss : JoinedOrSeparate<["-"], "Tbss">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of BSS to <addr>">;
	def Tdata : JoinedOrSeparate<["-"], "Tdata">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of DATA to <addr>">;
	def Ttext : JoinedOrSeparate<["-"], "Ttext">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of TEXT to <addr>">;
	// Plain -T takes a linker script rather than an address.
	def T : JoinedOrSeparate<["-"], "T">, Group<T_Group>,
	MetaVarName<"<script>">, HelpText<"Specify <script> as linker script">;
	// -U, -V, and the -W family: the comma-joined pass-through forms
	// (-Wa, / -Wl, / -Wp,), a few individually named warning flags, and the
	// generic -W<warning> Joined option.
	def U : JoinedOrSeparate<["-"], "U">, Group<Preprocessor_Group>,
	Flags<[CC1Option]>, MetaVarName<"<macro>">, HelpText<"Undefine macro <macro>">;
	def V : JoinedOrSeparate<["-"], "V">, Flags<[DriverOption, Unsupported]>;
	def Wa_COMMA : CommaJoined<["-"], "Wa,">,
	HelpText<"Pass the comma separated arguments in <arg> to the assembler">,
	MetaVarName<"<arg>">;
	def Wall : Flag<["-"], "Wall">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def WCL4 : Flag<["-"], "WCL4">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def Wdeprecated : Flag<["-"], "Wdeprecated">, Group<W_Group>, Flags<[CC1Option]>,
	HelpText<"Enable warnings for deprecated constructs and define __DEPRECATED">;
	def Wno_deprecated : Flag<["-"], "Wno-deprecated">, Group<W_Group>, Flags<[CC1Option]>;
	def Wl_COMMA : CommaJoined<["-"], "Wl,">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass the comma separated arguments in <arg> to the linker">,
	MetaVarName<"<arg>">, Group<Link_Group>;
	// FIXME: This is broken; these should not be Joined arguments.
	def Wno_nonportable_cfstrings : Joined<["-"], "Wno-nonportable-cfstrings">, Group<W_Group>,
	Flags<[CC1Option]>;
	def Wnonportable_cfstrings : Joined<["-"], "Wnonportable-cfstrings">, Group<W_Group>,
	Flags<[CC1Option]>;
	def Wp_COMMA : CommaJoined<["-"], "Wp,">,
	HelpText<"Pass the comma separated arguments in <arg> to the preprocessor">,
	MetaVarName<"<arg>">, Group<Preprocessor_Group>;
	def Wwrite_strings : Flag<["-"], "Wwrite-strings">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def Wno_write_strings : Flag<["-"], "Wno-write-strings">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def W_Joined : Joined<["-"], "W">, Group<W_Group>, Flags<[CC1Option, CoreOption]>,
	MetaVarName<"<warning>">, HelpText<"Enable the specified warning">;
	// -X<tool> pass-through options: each takes a separate <arg> and hands it to
	// the tool named in its help text. Also -z, and the bare/joined -X and -Z
	// spellings.
	def Xanalyzer : Separate<["-"], "Xanalyzer">,
	HelpText<"Pass <arg> to the static analyzer">, MetaVarName<"<arg>">,
	Group<StaticAnalyzer_Group>;
	def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[DriverOption]>;
	def Xassembler : Separate<["-"], "Xassembler">,
	HelpText<"Pass <arg> to the assembler">, MetaVarName<"<arg>">,
	Group<CompileOnly_Group>;
	def Xclang : Separate<["-"], "Xclang">,
	HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
	Flags<[DriverOption, CoreOption]>, Group<CompileOnly_Group>;
	def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
	HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
	def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
	HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
	def Xopenmp_target : Separate<["-"], "Xopenmp-target">,
	HelpText<"Pass <arg> to the target offloading toolchain.">, MetaVarName<"<arg>">;
	// The "=" form is JoinedAndSeparate: the triple is joined after '=' and
	// <arg> follows as a separate argument.
	def Xopenmp_target_EQ : JoinedAndSeparate<["-"], "Xopenmp-target=">,
	HelpText<"Pass <arg> to the specified target offloading toolchain. The triple that identifies the toolchain must be provided after the equals sign.">, MetaVarName<"<arg>">;
	def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">,
	Group<Link_Group>;
	def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass <arg> to the linker">, MetaVarName<"<arg>">,
	Group<Link_Group>;
	def Xpreprocessor : Separate<["-"], "Xpreprocessor">, Group<Preprocessor_Group>,
	HelpText<"Pass <arg> to the preprocessor">, MetaVarName<"<arg>">;
	// Bare -X is in Link_Group; any other -X<...> spelling falls to the Joined
	// record, which is tagged IgnoredGCCCompat.
	def X_Flag : Flag<["-"], "X">, Group<Link_Group>;
	def X_Joined : Joined<["-"], "X">, IgnoredGCCCompat;
	def Z_Flag : Flag<["-"], "Z">, Group<Link_Group>;
	// FIXME: All we do with this is reject it. Remove.
	def Z_Joined : Joined<["-"], "Z">;
	// Options starting with 'a' and 'b'. In the record names a double underscore
	// stands for an underscore in the spelling (all__load => "-all_load").
	// None of these carry help text.
	def all__load : Flag<["-"], "all_load">;
	def allowable__client : Separate<["-"], "allowable_client">;
	// Accepted with either a "-" or a "--" prefix.
	def ansi : Flag<["-", "--"], "ansi">;
	def arch__errors__fatal : Flag<["-"], "arch_errors_fatal">;
	def arch : Separate<["-"], "arch">, Flags<[DriverOption]>;
	def arch__only : Separate<["-"], "arch_only">;
	def a : Joined<["-"], "a">;
	def autocomplete : Joined<["--"], "autocomplete=">;
	def bind__at__load : Flag<["-"], "bind_at_load">;
	def bundle__loader : Separate<["-"], "bundle_loader">;
	def bundle : Flag<["-"], "bundle">;
	def b : JoinedOrSeparate<["-"], "b">, Flags<[Unsupported]>;
	// OpenCL options (opencl_Group). Every record carries CC1Option. All are
	// plain flags except -cl-std=, which takes one of the listed Values<>.
	def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. This option disables all optimizations. By default optimizations are enabled.">;
	def cl_strict_aliasing : Flag<["-"], "cl-strict-aliasing">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. This option is added for compatibility with OpenCL 1.0.">;
	def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Treat double precision floating-point constant as single precision constant.">;
	def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
	def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Generate kernel argument metadata.">;
	def cl_unsafe_math_optimizations : Flag<["-"], "cl-unsafe-math-optimizations">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow unsafe floating-point optimizations. Also implies -cl-no-signed-zeros and -cl-mad-enable.">;
	def cl_fast_relaxed_math : Flag<["-"], "cl-fast-relaxed-math">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Sets -cl-finite-math-only and -cl-unsafe-math-optimizations, and defines __FAST_RELAXED_MATH__.">;
	def cl_mad_enable : Flag<["-"], "cl-mad-enable">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow use of less precise MAD computations in the generated binary.">;
	def cl_no_signed_zeros : Flag<["-"], "cl-no-signed-zeros">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.">;
	// Values<> lists both lower- and upper-case spellings of each standard.
	def cl_std_EQ : Joined<["-"], "cl-std=">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL language standard to compile for.">, Values<"cl,CL,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0">;
	def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
	def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
	// Remaining options starting with 'c': -client_name, --config and its
	// directory overrides, -coverage, -cxx-isystem, and the -c action.
	def client__name : JoinedOrSeparate<["-"], "client_name">;
	def combine : Flag<["-", "--"], "combine">, Flags<[DriverOption, Unsupported]>;
	def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
	// Configuration-file options; all three use only the "--" prefix. The two
	// directory options are HelpHidden.
	def config : Separate<["--"], "config">, Flags<[DriverOption]>,
	HelpText<"Specifies configuration file">;
	def config_system_dir_EQ : Joined<["--"], "config-system-dir=">, Flags<[DriverOption, HelpHidden]>,
	HelpText<"System directory for configuration files">;
	def config_user_dir_EQ : Joined<["--"], "config-user-dir=">, Flags<[DriverOption, HelpHidden]>,
	HelpText<"User directory for configuration files">;
	def coverage : Flag<["-", "--"], "coverage">, Flags<[CoreOption]>;
	def cpp_precomp : Flag<["-"], "cpp-precomp">, Group<clang_ignored_f_Group>;
	def current__version : JoinedOrSeparate<["-"], "current_version">;
	def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group<clang_i_Group>,
	HelpText<"Add directory to the C++ SYSTEM include search path">, Flags<[CC1Option]>,
	MetaVarName<"<directory>">;
	def c : Flag<["-"], "c">, Flags<[DriverOption]>, Group<Action_Group>,
	HelpText<"Only run preprocess, compile, and assemble steps">;
	// CUDA options. The --cuda-* / --no-cuda-* / --ptxas-path= spellings use the
	// "--" prefix only; the -fcuda-* / -fno-cuda-* pairs at the end use "-".
	def cuda_device_only : Flag<["--"], "cuda-device-only">,
	HelpText<"Compile CUDA code for device only">;
	def cuda_host_only : Flag<["--"], "cuda-host-only">,
	HelpText<"Compile CUDA code for host only. Has no effect on non-CUDA "
	"compilations.">;
	def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">,
	HelpText<"Compile CUDA code for both host and device (default). Has no "
	"effect on non-CUDA compilations.">;
	// GPU architecture list: add with --cuda-gpu-arch=, remove with
	// --no-cuda-gpu-arch= (per the help strings).
	def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>,
	HelpText<"CUDA GPU architecture (e.g. sm_35). May be specified more than once.">;
	def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>,
	HelpText<"Remove GPU architecture (e.g. sm_35) from the list of GPUs to compile for. "
	"'all' resets the list to its default value.">;
	def cuda_noopt_device_debug : Flag<["--"], "cuda-noopt-device-debug">,
	HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">;
	def no_cuda_version_check : Flag<["--"], "no-cuda-version-check">,
	HelpText<"Don't error out if the detected version of the CUDA install is "
	"too low for the requested CUDA gpu architecture.">;
	def no_cuda_noopt_device_debug : Flag<["--"], "no-cuda-noopt-device-debug">;
	def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
	HelpText<"CUDA installation path">;
	def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Group<i_Group>,
	HelpText<"Path to ptxas (used for compiling CUDA code)">;
	// Positive forms carry CC1Option and help text; the -fno-* counterparts
	// have neither.
	def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
	Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
	def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
	def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
	Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
	def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
	// Options starting with 'd': the -d<letter> dump flags (d_Group), dependency
	// output files, -dump* queries, and several dy*/dead_strip spellings that
	// carry no help text.
	def dA : Flag<["-"], "dA">, Group<d_Group>;
	def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
	HelpText<"Print macro definitions in -E mode in addition to normal output">;
	def dI : Flag<["-"], "dI">, Group<d_Group>, Flags<[CC1Option]>,
	HelpText<"Print include directives in -E mode in addition to normal output">;
	def dM : Flag<["-"], "dM">, Group<d_Group>, Flags<[CC1Option]>,
	HelpText<"Print macro definitions in -E mode instead of normal output">;
	def dead__strip : Flag<["-"], "dead_strip">;
	def dependency_file : Separate<["-"], "dependency-file">, Flags<[CC1Option]>,
	HelpText<"Filename (or -) to write dependency output to">;
	def dependency_dot : Separate<["-"], "dependency-dot">, Flags<[CC1Option]>,
	HelpText<"Filename to write DOT-formatted header dependencies to">;
	def module_dependency_dir : Separate<["-"], "module-dependency-dir">,
	Flags<[CC1Option]>, HelpText<"Directory to dump module dependencies to">;
	def dumpmachine : Flag<["-"], "dumpmachine">;
	def dumpspecs : Flag<["-"], "dumpspecs">, Flags<[Unsupported]>;
	def dumpversion : Flag<["-"], "dumpversion">;
	def dylib__file : Separate<["-"], "dylib_file">;
	def dylinker__install__name : JoinedOrSeparate<["-"], "dylinker_install_name">;
	def dylinker : Flag<["-"], "dylinker">;
	def dynamiclib : Flag<["-"], "dynamiclib">;
	def dynamic : Flag<["-"], "dynamic">, Flags<[NoArgumentUnused]>;
	// Generic fallbacks in d_Group: bare "-d" and any other "-d<...>" spelling.
	def d_Flag : Flag<["-"], "d">, Group<d_Group>;
	def d_Joined : Joined<["-"], "d">, Group<d_Group>;
	// -emit-*, -e, the -fPIC/-fPIE pairs, and the first few -fa* options,
	// plus the sanitizer runtime linkage selectors.
	def emit_ast : Flag<["-"], "emit-ast">,
	HelpText<"Emit Clang AST files for source inputs">;
	def emit_llvm : Flag<["-"], "emit-llvm">, Flags<[CC1Option]>, Group<Action_Group>,
	HelpText<"Use the LLVM representation for assembler and object files">;
	def exported__symbols__list : Separate<["-"], "exported_symbols_list">;
	def e : JoinedOrSeparate<["-"], "e">, Group<Link_Group>;
	def fPIC : Flag<["-"], "fPIC">, Group<f_Group>;
	def fno_PIC : Flag<["-"], "fno-PIC">, Group<f_Group>;
	def fPIE : Flag<["-"], "fPIE">, Group<f_Group>;
	def fno_PIE : Flag<["-"], "fno-PIE">, Group<f_Group>;
	def faccess_control : Flag<["-"], "faccess-control">, Group<f_Group>;
	def fallow_unsupported : Flag<["-"], "fallow-unsupported">, Group<f_Group>;
	def fapple_kext : Flag<["-"], "fapple-kext">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use Apple's kernel extensions ABI">;
	def fapple_pragma_pack : Flag<["-"], "fapple-pragma-pack">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable Apple gcc-compatible #pragma pack handling">;
	def shared_libsan : Flag<["-"], "shared-libsan">;
	def static_libsan : Flag<["-"], "static-libsan">;
	// "-shared-libasan" is an anonymous alias for -shared-libsan.
	def : Flag<["-"], "shared-libasan">, Alias<shared_libsan>;
	def fasm : Flag<["-"], "fasm">, Group<f_Group>;

	// -fasm-blocks / -fno-asm-blocks pair. Only the positive form carries
	// CC1Option.
	def fasm_blocks
	    : Flag<["-"], "fasm-blocks">,
	      Group<f_Group>,
	      Flags<[CC1Option]>;
	def fno_asm_blocks
	    : Flag<["-"], "fno-asm-blocks">,
	      Group<f_Group>;

	// Plain f_Group flags with no extra Flags<> and no help text.
	def fassume_sane_operator_new : Flag<["-"], "fassume-sane-operator-new">, Group<f_Group>;
	def fastcp : Flag<["-"], "fastcp">, Group<f_Group>;
	def fastf : Flag<["-"], "fastf">, Group<f_Group>;
	def fast : Flag<["-"], "fast">, Group<f_Group>;
	def fasynchronous_unwind_tables : Flag<["-"], "fasynchronous-unwind-tables">, Group<f_Group>;

	// Enable/disable pair for '[[]]' attribute syntax in all C and C++ language
	// modes. Both forms carry DriverOption and CC1Option.
	def fdouble_square_bracket_attributes
	    : Flag<["-"], "fdouble-square-bracket-attributes">,
	      Group<f_Group>,
	      Flags<[DriverOption, CC1Option]>,
	      HelpText<"Enable '[[]]' attributes in all C and C++ language modes">;
	def fno_double_square_bracket_attributes
	    : Flag<["-"], "fno-double-square-bracket-attributes">,
	      Group<f_Group>,
	      Flags<[DriverOption, CC1Option]>,
	      HelpText<"Disable '[[]]' attributes in all C and C++ language modes">;

	// -fautolink / -fno-autolink pair. Only the negative form carries
	// DriverOption, CC1Option and help text.
	def fautolink
	    : Flag<["-"], "fautolink">,
	      Group<f_Group>;
	def fno_autolink
	    : Flag<["-"], "fno-autolink">,
	      Group<f_Group>,
	      Flags<[DriverOption, CC1Option]>,
	      HelpText<"Disable generation of linker directives for automatic library linking">;

	// C++ Coroutines TS: enable/disable pair. Only the positive form carries
	// CC1Option and help text; the negative form is DriverOption only.
	def fcoroutines_ts
	    : Flag<["-"], "fcoroutines-ts">,
	      Group<f_Group>,
	      Flags<[DriverOption, CC1Option]>,
	      HelpText<"Enable support for the C++ Coroutines TS">;
	def fno_coroutines_ts
	    : Flag<["-"], "fno-coroutines-ts">,
	      Group<f_Group>,
	      Flags<[DriverOption]>;

	// Bitcode embedding. -fembed-bitcode=<option> is the primary record; the two
	// flag spellings are aliases that supply a fixed argument ("all" / "marker").
	def fembed_bitcode_EQ : Joined<["-"], "fembed-bitcode=">,
	Group<f_Group>, Flags<[DriverOption, CC1Option]>, MetaVarName<"<option>">,
	HelpText<"Embed LLVM bitcode (option: off, all, bitcode, marker)">;
	def fembed_bitcode : Flag<["-"], "fembed-bitcode">, Group<f_Group>,
	Alias<fembed_bitcode_EQ>, AliasArgs<["all"]>,
	HelpText<"Embed LLVM IR bitcode as data">;
	// Note: unlike -fembed-bitcode, this alias has no Group<> of its own.
	def fembed_bitcode_marker : Flag<["-"], "fembed-bitcode-marker">,
	Alias<fembed_bitcode_EQ>, AliasArgs<["marker"]>,
	HelpText<"Embed placeholder LLVM IR data as a marker">;
	// GNU inline asm toggle; only the negative form carries CC1Option.
	def fgnu_inline_asm : Flag<["-"], "fgnu-inline-asm">, Group<f_Group>, Flags<[DriverOption]>;
	def fno_gnu_inline_asm : Flag<["-"], "fno-gnu-inline-asm">, Group<f_Group>,
	Flags<[DriverOption, CC1Option]>,
	HelpText<"Disable GNU style inline asm">;

	// Profile-guided optimization and coverage options.
	//
	// Sample-based profiling: -fprofile-sample-use[=], -fprofile-sample-accurate
	// and their -fno- forms. The -fauto-profile* spellings further down are
	// aliases of the matching -fprofile-sample-* records.
	def fprofile_sample_use : Flag<["-"], "fprofile-sample-use">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fno_profile_sample_use : Flag<["-"], "fno-profile-sample-use">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fprofile_sample_use_EQ : Joined<["-"], "fprofile-sample-use=">,
	Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	HelpText<"Enable sample-based profile guided optimizations">;
	def fprofile_sample_accurate : Flag<["-"], "fprofile-sample-accurate">,
	Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	HelpText<"Specifies that the sample profile is accurate">,
	DocBrief<[{Specifies that the sample profile is accurate. If the sample
	profile is accurate, callsites without profile samples are marked
	as cold. Otherwise, treat callsites without profile samples as if
	we have no profile}]>;
	def fno_profile_sample_accurate : Flag<["-"], "fno-profile-sample-accurate">,
	Group<f_Group>, Flags<[DriverOption]>;
	// -fauto-profile* aliases.
	def fauto_profile : Flag<["-"], "fauto-profile">, Group<f_Group>,
	Alias<fprofile_sample_use>;
	def fno_auto_profile : Flag<["-"], "fno-auto-profile">, Group<f_Group>,
	Alias<fno_profile_sample_use>;
	def fauto_profile_EQ : Joined<["-"], "fauto-profile=">,
	Alias<fprofile_sample_use_EQ>;
	def fauto_profile_accurate : Flag<["-"], "fauto-profile-accurate">,
	Group<f_Group>, Alias<fprofile_sample_accurate>;
	def fno_auto_profile_accurate : Flag<["-"], "fno-auto-profile-accurate">,
	Group<f_Group>, Alias<fno_profile_sample_accurate>;
	def fdebug_info_for_profiling : Flag<["-"], "fdebug-info-for-profiling">, Group<f_Group>,
	Flags<[CC1Option]>,
	HelpText<"Emit extra debug info to make sample profile more accurate.">;
	def fno_debug_info_for_profiling : Flag<["-"], "fno-debug-info-for-profiling">, Group<f_Group>,
	Flags<[DriverOption]>,
	HelpText<"Do not emit extra debug info for sample profiler.">;
	// Instrumentation-based profiling (-fprofile-instr-*), coverage mapping, and
	// the -fprofile-generate / -fprofile-use spellings.
	def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;
	def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
	Group<f_Group>, Flags<[CoreOption]>, MetaVarName<"<file>">,
	HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
	def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
	Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Use instrumentation data for profile-guided optimization">;
	def fcoverage_mapping : Flag<["-"], "fcoverage-mapping">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Generate coverage mapping to enable code coverage analysis">;
	def fno_coverage_mapping : Flag<["-"], "fno-coverage-mapping">,
	Group<f_Group>, Flags<[DriverOption]>,
	HelpText<"Disable code coverage analysis">;
	def fprofile_generate : Flag<["-"], "fprofile-generate">,
	Group<f_Group>, Flags<[DriverOption]>,
	HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
	def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">,
	Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<directory>">,
	HelpText<"Generate instrumented code to collect execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
	// Bare -fprofile-use is an alias of -fprofile-instr-use; the "=" form is
	// its own record, not an alias.
	def fprofile_use : Flag<["-"], "fprofile-use">, Group<f_Group>,
	Alias<fprofile_instr_use>;
	def fprofile_use_EQ : Joined<["-"], "fprofile-use=">,
	Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<pathname>">,
	HelpText<"Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from <pathname>/default.profdata. Otherwise, it reads from file <pathname>.">;
	// Negative forms.
	def fno_profile_instr_generate : Flag<["-"], "fno-profile-instr-generate">,
	Group<f_Group>, Flags<[DriverOption]>,
	HelpText<"Disable generation of profile instrumentation.">;
	def fno_profile_generate : Flag<["-"], "fno-profile-generate">,
	Group<f_Group>, Flags<[DriverOption]>,
	HelpText<"Disable generation of profile instrumentation.">;
	def fno_profile_instr_use : Flag<["-"], "fno-profile-instr-use">,
	Group<f_Group>, Flags<[DriverOption]>,
	HelpText<"Disable using instrumentation data for profile-guided optimization">;
	def fno_profile_use : Flag<["-"], "fno-profile-use">,
	Alias<fno_profile_instr_use>;

	// -fb* and -fc* options: language-feature toggles, diagnostics colouring,
	// documentation-comment parsing, constexpr limits, and related flags.
	def fblocks : Flag<["-"], "fblocks">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable the 'blocks' language feature">;
	def fbootclasspath_EQ : Joined<["-"], "fbootclasspath=">, Group<f_Group>;
	def fborland_extensions : Flag<["-"], "fborland-extensions">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Accept non-standard constructs supported by the Borland compiler">;
	def fbuiltin : Flag<["-"], "fbuiltin">, Group<f_Group>;
	def fbuiltin_module_map : Flag <["-"], "fbuiltin-module-map">, Group<f_Group>,
	Flags<[DriverOption]>, HelpText<"Load the clang builtins module map file.">;
	def fcaret_diagnostics : Flag<["-"], "fcaret-diagnostics">, Group<f_Group>;
	// Values<> here is a descriptive pattern ("<major>.<minor>") plus the
	// literal "latest".
	def fclang_abi_compat_EQ : Joined<["-"], "fclang-abi-compat=">, Group<f_clang_Group>,
	Flags<[CC1Option]>, MetaVarName<"<version>">, Values<"<major>.<minor>,latest">,
	HelpText<"Attempt to match the ABI of Clang <version>">;
	def fclasspath_EQ : Joined<["-"], "fclasspath=">, Group<f_Group>;
	// Colour / escape-code control for diagnostics.
	def fcolor_diagnostics : Flag<["-"], "fcolor-diagnostics">, Group<f_Group>,
	Flags<[CoreOption, CC1Option]>, HelpText<"Use colors in diagnostics">;
	def fdiagnostics_color : Flag<["-"], "fdiagnostics-color">, Group<f_Group>,
	Flags<[CoreOption, DriverOption]>;
	def fdiagnostics_color_EQ : Joined<["-"], "fdiagnostics-color=">, Group<f_Group>;
	def fansi_escape_codes : Flag<["-"], "fansi-escape-codes">, Group<f_Group>,
	Flags<[CoreOption, CC1Option]>, HelpText<"Use ANSI escape codes for diagnostics">;
	def fcomment_block_commands : CommaJoined<["-"], "fcomment-block-commands=">, Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Treat each comma separated argument in <arg> as a documentation comment block command">,
	MetaVarName<"<arg>">;
	def fparse_all_comments : Flag<["-"], "fparse-all-comments">, Group<f_clang_Group>, Flags<[CC1Option]>;
	def fcommon : Flag<["-"], "fcommon">, Group<f_Group>;
	def fcompile_resource_EQ : Joined<["-"], "fcompile-resource=">, Group<f_Group>;
	def fconstant_cfstrings : Flag<["-"], "fconstant-cfstrings">, Group<f_Group>;
	def fconstant_string_class_EQ : Joined<["-"], "fconstant-string-class=">, Group<f_Group>;
	// constexpr limits; each takes its value after '='.
	def fconstexpr_depth_EQ : Joined<["-"], "fconstexpr-depth=">, Group<f_Group>;
	def fconstexpr_steps_EQ : Joined<["-"], "fconstexpr-steps=">, Group<f_Group>;
	def fconstexpr_backtrace_limit_EQ : Joined<["-"], "fconstexpr-backtrace-limit=">,
	Group<f_Group>;
	def fno_crash_diagnostics : Flag<["-"], "fno-crash-diagnostics">, Group<f_clang_Group>, Flags<[NoArgumentUnused]>,
	HelpText<"Disable auto-generation of preprocessed source files and a script for reproduction during a clang crash">;
	def fcreate_profile : Flag<["-"], "fcreate-profile">, Group<f_Group>;
	def fcxx_exceptions: Flag<["-"], "fcxx-exceptions">, Group<f_Group>,
	HelpText<"Enable C++ exceptions">, Flags<[CC1Option]>;
	def fcxx_modules : Flag <["-"], "fcxx-modules">, Group<f_Group>,
	Flags<[DriverOption]>;
	// -fd* and -fe* options: debug-pass dumps, the -fdiagnostics-* family,
	// DWARF assembler directives, and assorted language/codegen toggles.
	def fdebug_pass_arguments : Flag<["-"], "fdebug-pass-arguments">, Group<f_Group>;
	def fdebug_pass_structure : Flag<["-"], "fdebug-pass-structure">, Group<f_Group>;
	// Record name has no "_EQ" suffix, but the spelling ends in '=' and the
	// option is Joined.
	def fdepfile_entry : Joined<["-"], "fdepfile-entry=">,
	Group<f_clang_Group>, Flags<[CC1Option]>;
	// Diagnostics presentation options.
	def fdiagnostics_fixit_info : Flag<["-"], "fdiagnostics-fixit-info">, Group<f_clang_Group>;
	def fdiagnostics_parseable_fixits : Flag<["-"], "fdiagnostics-parseable-fixits">, Group<f_clang_Group>,
	Flags<[CoreOption, CC1Option]>, HelpText<"Print fix-its in machine parseable form">;
	def fdiagnostics_print_source_range_info : Flag<["-"], "fdiagnostics-print-source-range-info">,
	Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Print source range spans in numeric form">;
	def fdiagnostics_show_hotness : Flag<["-"], "fdiagnostics-show-hotness">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Enable profile hotness information in diagnostic line">;
	def fdiagnostics_hotness_threshold_EQ : Joined<["-"], "fdiagnostics-hotness-threshold=">,
	Group<f_Group>, Flags<[CC1Option]>, MetaVarName<"<number>">,
	HelpText<"Prevent optimization remarks from being output if they do not have at least this profile count">;
	def fdiagnostics_show_option : Flag<["-"], "fdiagnostics-show-option">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Print option name with mappable diagnostics">;
	def fdiagnostics_show_note_include_stack : Flag<["-"], "fdiagnostics-show-note-include-stack">,
	Group<f_Group>, Flags<[CC1Option]>, HelpText<"Display include stacks for diagnostic notes">;
	def fdiagnostics_format_EQ : Joined<["-"], "fdiagnostics-format=">, Group<f_clang_Group>;
	def fdiagnostics_show_category_EQ : Joined<["-"], "fdiagnostics-show-category=">, Group<f_clang_Group>;
	def fdiagnostics_show_template_tree : Flag<["-"], "fdiagnostics-show-template-tree">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Print a template comparison tree for differing templates">;
	def fdeclspec : Flag<["-"], "fdeclspec">, Group<f_clang_Group>,
	HelpText<"Allow __declspec as a keyword">, Flags<[CC1Option]>;
	def fdollars_in_identifiers : Flag<["-"], "fdollars-in-identifiers">, Group<f_Group>,
	HelpText<"Allow '$' in identifiers">, Flags<[CC1Option]>;
	// Both -f[no-]dwarf2-cfi-asm spellings are in clang_ignored_f_Group.
	def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group<clang_ignored_f_Group>;
	def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group<clang_ignored_f_Group>;
	// Only the negative form carries CC1Option.
	def fdwarf_directory_asm : Flag<["-"], "fdwarf-directory-asm">, Group<f_Group>;
	def fno_dwarf_directory_asm : Flag<["-"], "fno-dwarf-directory-asm">, Group<f_Group>, Flags<[CC1Option]>;
	def felide_constructors : Flag<["-"], "felide-constructors">, Group<f_Group>;
	def fno_elide_type : Flag<["-"], "fno-elide-type">, Group<f_Group>,
	Flags<[CC1Option]>,
	HelpText<"Do not elide types when printing diagnostics">;
	def feliminate_unused_debug_symbols : Flag<["-"], "feliminate-unused-debug-symbols">, Group<f_Group>;
	def femit_all_decls : Flag<["-"], "femit-all-decls">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Emit all declarations, even if unused">;
	// Emulated TLS toggle; only the positive form carries CC1Option.
	def femulated_tls : Flag<["-"], "femulated-tls">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use emutls functions to access thread_local variables">;
	def fno_emulated_tls : Flag<["-"], "fno-emulated-tls">, Group<f_Group>;
	def fencoding_EQ : Joined<["-"], "fencoding=">, Group<f_Group>;
	def ferror_limit_EQ : Joined<["-"], "ferror-limit=">, Group<f_Group>, Flags<[CoreOption]>;
	def fexceptions : Flag<["-"], "fexceptions">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable support for exception handling">;
	def fdwarf_exceptions : Flag<["-"], "fdwarf-exceptions">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Use DWARF style exceptions">;
	def fsjlj_exceptions : Flag<["-"], "fsjlj-exceptions">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Use SjLj style exceptions">;
	def fseh_exceptions : Flag<["-"], "fseh-exceptions">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Use SEH style exceptions">;
	def fexcess_precision_EQ : Joined<["-"], "fexcess-precision=">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fexpensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fno-expensive-optimizations">, Group<clang_ignored_gcc_optimization_f_Group>;
	def fextdirs_EQ : Joined<["-"], "fextdirs=">, Group<f_Group>;
	def : Flag<["-"], "fdefer-pop">, Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fno-defer-pop">, Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fextended-identifiers">, Group<clang_ignored_f_Group>;
	def : Flag<["-"], "fno-extended-identifiers">, Group<f_Group>, Flags<[Unsupported]>;
	def fhosted : Flag<["-"], "fhosted">, Group<f_Group>;
	def fdenormal_fp_math_EQ : Joined<["-"], "fdenormal-fp-math=">, Group<f_Group>, Flags<[CC1Option]>;
	def ffast_math : Flag<["-"], "ffast-math">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Allow aggressive, lossy floating-point optimizations">;
	def fno_fast_math : Flag<["-"], "fno-fast-math">, Group<f_Group>;
	def fmath_errno : Flag<["-"], "fmath-errno">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Require math functions to indicate errors by setting errno">;
	def fno_math_errno : Flag<["-"], "fno-math-errno">, Group<f_Group>;
	def fbracket_depth_EQ : Joined<["-"], "fbracket-depth=">, Group<f_Group>;
	def fsignaling_math : Flag<["-"], "fsignaling-math">, Group<f_Group>;
	def fno_signaling_math : Flag<["-"], "fno-signaling-math">, Group<f_Group>;
	def fjump_tables : Flag<["-"], "fjump-tables">, Group<f_Group>;
	def fno_jump_tables : Flag<["-"], "fno-jump-tables">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Do not use jump tables for lowering switches">;

	// Begin sanitizer flags. These should all be core options exposed in all driver
	// modes.
	let Flags = [CC1Option, CoreOption] in {

	def fsanitize_EQ : CommaJoined<["-"], "fsanitize=">, Group<f_clang_Group>,
	MetaVarName<"<check>">,
	HelpText<"Turn on runtime checks for various forms of undefined "
	"or suspicious behavior. See user manual for available checks">;
	// -fno-sanitize=<check,...>: comma-joined negative counterpart of -fsanitize=.
	// NOTE(review): this record sits inside the enclosing
	// `let Flags = [CC1Option, CoreOption] in { ... }` block; presumably that
	// outer `let` is applied after the Flags<> mix-in below and therefore wins,
	// which would make the DriverOption listed here ineffective -- TODO confirm
	// against the generated option table before relying on either set of flags.
	def fno_sanitize_EQ : CommaJoined<["-"], "fno-sanitize=">, Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>;
	def fsanitize_blacklist : Joined<["-"], "fsanitize-blacklist=">,
	Group<f_clang_Group>,
	HelpText<"Path to blacklist file for sanitizers">;
	def fno_sanitize_blacklist : Flag<["-"], "fno-sanitize-blacklist">,
	Group<f_clang_Group>,
	HelpText<"Don't use blacklist file for sanitizers">;
	def fsanitize_coverage
	: CommaJoined<["-"], "fsanitize-coverage=">,
	Group<f_clang_Group>,
	HelpText<"Specify the type of coverage instrumentation for Sanitizers">;
	def fno_sanitize_coverage
	: CommaJoined<["-"], "fno-sanitize-coverage=">,
	Group<f_clang_Group>, Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable specified features of coverage instrumentation for "
	"Sanitizers">, Values<"func,bb,edge,indirect-calls,trace-bb,trace-cmp,trace-div,trace-gep,8bit-counters,trace-pc,trace-pc-guard,no-prune,inline-8bit-counters">;
	def fsanitize_memory_track_origins_EQ : Joined<["-"], "fsanitize-memory-track-origins=">,
	Group<f_clang_Group>,
	HelpText<"Enable origins tracking in MemorySanitizer">;
	def fsanitize_memory_track_origins : Flag<["-"], "fsanitize-memory-track-origins">,
	Group<f_clang_Group>,
	HelpText<"Enable origins tracking in MemorySanitizer">;
	def fno_sanitize_memory_track_origins : Flag<["-"], "fno-sanitize-memory-track-origins">,
	Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable origins tracking in MemorySanitizer">;
	def fsanitize_memory_use_after_dtor : Flag<["-"], "fsanitize-memory-use-after-dtor">,
	Group<f_clang_Group>,
	HelpText<"Enable use-after-destroy detection in MemorySanitizer">;
	def fno_sanitize_memory_use_after_dtor : Flag<["-"], "fno-sanitize-memory-use-after-dtor">,
	Group<f_clang_Group>,
	HelpText<"Disable use-after-destroy detection in MemorySanitizer">;
	def fsanitize_address_field_padding : Joined<["-"], "fsanitize-address-field-padding=">,
	Group<f_clang_Group>,
	HelpText<"Level of field padding for AddressSanitizer">;
	def fsanitize_address_use_after_scope : Flag<["-"], "fsanitize-address-use-after-scope">,
	Group<f_clang_Group>,
	HelpText<"Enable use-after-scope detection in AddressSanitizer">;
	def fno_sanitize_address_use_after_scope : Flag<["-"], "fno-sanitize-address-use-after-scope">,
	Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable use-after-scope detection in AddressSanitizer">;
	def fsanitize_address_globals_dead_stripping : Flag<["-"], "fsanitize-address-globals-dead-stripping">,
	Group<f_clang_Group>,
	HelpText<"Enable linker dead stripping of globals in AddressSanitizer">;
	def fsanitize_recover : Flag<["-"], "fsanitize-recover">, Group<f_clang_Group>;
	def fno_sanitize_recover : Flag<["-"], "fno-sanitize-recover">,
	Flags<[CoreOption, DriverOption]>,
	Group<f_clang_Group>;
	def fsanitize_recover_EQ : CommaJoined<["-"], "fsanitize-recover=">,
	Group<f_clang_Group>,
	HelpText<"Enable recovery for specified sanitizers">;
	def fno_sanitize_recover_EQ
	: CommaJoined<["-"], "fno-sanitize-recover=">,
	Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable recovery for specified sanitizers">;
	def fsanitize_trap_EQ : CommaJoined<["-"], "fsanitize-trap=">, Group<f_clang_Group>,
	HelpText<"Enable trapping for specified sanitizers">;
	def fno_sanitize_trap_EQ : CommaJoined<["-"], "fno-sanitize-trap=">, Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable trapping for specified sanitizers">;
	def fsanitize_undefined_trap_on_error : Flag<["-"], "fsanitize-undefined-trap-on-error">,
	Group<f_clang_Group>;
	def fno_sanitize_undefined_trap_on_error : Flag<["-"], "fno-sanitize-undefined-trap-on-error">,
	Group<f_clang_Group>;
	def fsanitize_minimal_runtime : Flag<["-"], "fsanitize-minimal-runtime">,
	Group<f_clang_Group>;
	def fno_sanitize_minimal_runtime : Flag<["-"], "fno-sanitize-minimal-runtime">,
	Group<f_clang_Group>;
	def fsanitize_link_cxx_runtime : Flag<["-"], "fsanitize-link-c++-runtime">,
	Group<f_clang_Group>;
	def fsanitize_cfi_cross_dso : Flag<["-"], "fsanitize-cfi-cross-dso">,
	Group<f_clang_Group>,
	HelpText<"Enable control flow integrity (CFI) checks for cross-DSO calls.">;
	def fno_sanitize_cfi_cross_dso : Flag<["-"], "fno-sanitize-cfi-cross-dso">,
	Flags<[CoreOption, DriverOption]>,
	Group<f_clang_Group>,
	HelpText<"Disable control flow integrity (CFI) checks for cross-DSO calls.">;
	def fsanitize_cfi_icall_generalize_pointers : Flag<["-"], "fsanitize-cfi-icall-generalize-pointers">,
	Group<f_clang_Group>,
	HelpText<"Generalize pointers in CFI indirect call type signature checks">;
	def fsanitize_stats : Flag<["-"], "fsanitize-stats">,
	Group<f_clang_Group>,
	HelpText<"Enable sanitizer statistics gathering.">;
	def fno_sanitize_stats : Flag<["-"], "fno-sanitize-stats">,
	Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable sanitizer statistics gathering.">;
	def fsanitize_thread_memory_access : Flag<["-"], "fsanitize-thread-memory-access">,
	Group<f_clang_Group>,
	HelpText<"Enable memory access instrumentation in ThreadSanitizer (default)">;
	def fno_sanitize_thread_memory_access : Flag<["-"], "fno-sanitize-thread-memory-access">,
	Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable memory access instrumentation in ThreadSanitizer">;
	def fsanitize_thread_func_entry_exit : Flag<["-"], "fsanitize-thread-func-entry-exit">,
	Group<f_clang_Group>,
	HelpText<"Enable function entry/exit instrumentation in ThreadSanitizer (default)">;
	def fno_sanitize_thread_func_entry_exit : Flag<["-"], "fno-sanitize-thread-func-entry-exit">,
	Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable function entry/exit instrumentation in ThreadSanitizer">;
	def fsanitize_thread_atomics : Flag<["-"], "fsanitize-thread-atomics">,
	Group<f_clang_Group>,
	HelpText<"Enable atomic operations instrumentation in ThreadSanitizer (default)">;
	def fno_sanitize_thread_atomics : Flag<["-"], "fno-sanitize-thread-atomics">,
	Group<f_clang_Group>,
	Flags<[CoreOption, DriverOption]>,
	HelpText<"Disable atomic operations instrumentation in ThreadSanitizer">;
	def fsanitize_undefined_strip_path_components_EQ : Joined<["-"], "fsanitize-undefined-strip-path-components=">,
	Group<f_clang_Group>, MetaVarName<"<number>">,
	HelpText<"Strip (or keep only, if negative) a given number of path components "
	"when emitting check metadata.">;

	} // end -f[no-]sanitize* flags

	def funsafe_math_optimizations : Flag<["-"], "funsafe-math-optimizations">,
	Group<f_Group>;
	def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">,
	Group<f_Group>;
	def fassociative_math : Flag<["-"], "fassociative-math">, Group<f_Group>;
	def fno_associative_math : Flag<["-"], "fno-associative-math">, Group<f_Group>;
	def freciprocal_math :
	Flag<["-"], "freciprocal-math">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Allow division operations to be reassociated">;
	def fno_reciprocal_math : Flag<["-"], "fno-reciprocal-math">, Group<f_Group>;
	def ffinite_math_only : Flag<["-"], "ffinite-math-only">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_finite_math_only : Flag<["-"], "fno-finite-math-only">, Group<f_Group>;
	def fsigned_zeros : Flag<["-"], "fsigned-zeros">, Group<f_Group>;
	def fno_signed_zeros :
	Flag<["-"], "fno-signed-zeros">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Allow optimizations that ignore the sign of floating point zeros">;
	def fhonor_nans : Flag<["-"], "fhonor-nans">, Group<f_Group>;
	def fno_honor_nans : Flag<["-"], "fno-honor-nans">, Group<f_Group>;
	def fhonor_infinities : Flag<["-"], "fhonor-infinities">, Group<f_Group>;
	def fno_honor_infinities : Flag<["-"], "fno-honor-infinities">, Group<f_Group>;
	// This option was originally misspelt "infinites" [sic].
	def : Flag<["-"], "fhonor-infinites">, Alias<fhonor_infinities>;
	def : Flag<["-"], "fno-honor-infinites">, Alias<fno_honor_infinities>;
	def ftrapping_math : Flag<["-"], "ftrapping-math">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_trapping_math : Flag<["-"], "fno-trapping-math">, Group<f_Group>, Flags<[CC1Option]>;
	// -ffp-contract=<mode>: controls when fused floating-point operations may be
	// formed. Accepted modes are enumerated in Values<> and described in the help
	// text. The separators in the help text are plain '|' characters; a
	// backslash-escaped bar is not a valid TableGen string escape.
	def ffp_contract : Joined<["-"], "ffp-contract=">, Group<f_Group>,
	  Flags<[CC1Option]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
	  " | on (according to FP_CONTRACT pragma, default) | off (never fuse)">, Values<"fast,on,off">;

	def ffor_scope : Flag<["-"], "ffor-scope">, Group<f_Group>;
	def fno_for_scope : Flag<["-"], "fno-for-scope">, Group<f_Group>;

	def frewrite_includes : Flag<["-"], "frewrite-includes">, Group<f_Group>,
	Flags<[CC1Option]>;
	def fno_rewrite_includes : Flag<["-"], "fno-rewrite-includes">, Group<f_Group>;

	def frewrite_imports : Flag<["-"], "frewrite-imports">, Group<f_Group>,
	Flags<[CC1Option]>;
	def fno_rewrite_imports : Flag<["-"], "fno-rewrite-imports">, Group<f_Group>;

	def frewrite_map_file : Separate<["-"], "frewrite-map-file">,
	Group<f_Group>,
	Flags<[ DriverOption, CC1Option ]>;
	def frewrite_map_file_EQ : Joined<["-"], "frewrite-map-file=">,
	Group<f_Group>,
	Flags<[DriverOption]>;

	def fuse_line_directives : Flag<["-"], "fuse-line-directives">, Group<f_Group>,
	Flags<[CC1Option]>;
	def fno_use_line_directives : Flag<["-"], "fno-use-line-directives">, Group<f_Group>;

	def ffreestanding : Flag<["-"], "ffreestanding">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Assert that the compilation takes place in a freestanding environment">;
	def fgnu_keywords : Flag<["-"], "fgnu-keywords">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Allow GNU-extension keywords regardless of language standard">;
	def fgnu89_inline : Flag<["-"], "fgnu89-inline">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use the gnu89 inline semantics">;
	def fno_gnu89_inline : Flag<["-"], "fno-gnu89-inline">, Group<f_Group>;
	def fgnu_runtime : Flag<["-"], "fgnu-runtime">, Group<f_Group>,
	HelpText<"Generate output compatible with the standard GNU Objective-C runtime">;
	def fheinous_gnu_extensions : Flag<["-"], "fheinous-gnu-extensions">, Flags<[CC1Option]>;
	def filelist : Separate<["-"], "filelist">, Flags<[LinkerInput]>,
	Group<Link_Group>;
	def : Flag<["-"], "findirect-virtual-calls">, Alias<fapple_kext>;
	def finline_functions : Flag<["-"], "finline-functions">, Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Inline suitable functions">;
	def finline_hint_functions: Flag<["-"], "finline-hint-functions">, Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Inline functions which are (explicitly or implicitly) marked inline">;
	def finline : Flag<["-"], "finline">, Group<clang_ignored_f_Group>;
	def fexperimental_isel : Flag<["-"], "fexperimental-isel">, Group<f_clang_Group>,
	HelpText<"Enables the experimental global instruction selector">;
	def fexperimental_new_pass_manager : Flag<["-"], "fexperimental-new-pass-manager">,
	Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Enables an experimental new pass manager in LLVM.">;
	def finput_charset_EQ : Joined<["-"], "finput-charset=">, Group<f_Group>;
	def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>;
	def finstrument_functions : Flag<["-"], "finstrument-functions">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Generate calls to instrument function entry and exit">;
	def finstrument_functions_after_inlining : Flag<["-"], "finstrument-functions-after-inlining">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Like -finstrument-functions, but insert the calls after inlining">;
	def finstrument_function_entry_bare : Flag<["-"], "finstrument-function-entry-bare">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Instrument function entry only, after inlining, without arguments to the instrumentation call">;

	def fxray_instrument : Flag<["-"], "fxray-instrument">, Group<f_Group>,
	Flags<[CC1Option]>,
	HelpText<"Generate XRay instrumentation sleds on function entry and exit">;
	def fnoxray_instrument : Flag<["-"], "fno-xray-instrument">, Group<f_Group>,
	Flags<[CC1Option]>;

	def fxray_instruction_threshold_EQ :
	JoinedOrSeparate<["-"], "fxray-instruction-threshold=">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Sets the minimum function size to instrument with XRay">;
	// Same option as -fxray-instruction-threshold= but spelled without the
	// trailing '='. It carries no HelpText of its own.
	def fxray_instruction_threshold_ :
	JoinedOrSeparate<["-"], "fxray-instruction-threshold">,
	Group<f_Group>, Flags<[CC1Option]>;

	// -fxray-always-instrument=<file>: file naming what should receive the
	// 'always instrument' XRay attribute (see help text).
	def fxray_always_instrument
	  : JoinedOrSeparate<["-"], "fxray-always-instrument=">,
	    Flags<[CC1Option]>,
	    Group<f_Group>,
	    HelpText<"Filename defining the whitelist for imbuing the 'always instrument' XRay attribute.">;
	def fxray_never_instrument :
	JoinedOrSeparate<["-"], "fxray-never-instrument=">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Filename defining the whitelist for imbuing the 'never instrument' XRay attribute.">;

	def fxray_always_emit_customevents : Flag<["-"], "fxray-always-emit-customevents">, Group<f_Group>,
	Flags<[CC1Option]>,
	HelpText<"Determine whether to always emit __xray_customevent(...) calls even if the function it appears in is not always instrumented.">;
	def fnoxray_always_emit_customevents : Flag<["-"], "fno-xray-always-emit-customevents">, Group<f_Group>,
	Flags<[CC1Option]>;

	def ffine_grained_bitfield_accesses : Flag<["-"],
	"ffine-grained-bitfield-accesses">, Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Use separate accesses for bitfields with legal widths and alignments.">;
	def fno_fine_grained_bitfield_accesses : Flag<["-"],
	"fno-fine-grained-bitfield-accesses">, Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Use large-integer access for consecutive bitfield runs.">;

	def flat__namespace : Flag<["-"], "flat_namespace">;
	def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">, Group<f_Group>;
	def flimited_precision_EQ : Joined<["-"], "flimited-precision=">, Group<f_Group>;
	def flto_EQ : Joined<["-"], "flto=">, Flags<[CoreOption, CC1Option]>, Group<f_Group>,
	HelpText<"Set LTO mode to either 'full' or 'thin'">, Values<"thin,full">;
	def flto : Flag<["-"], "flto">, Flags<[CoreOption, CC1Option]>, Group<f_Group>,
	HelpText<"Enable LTO in 'full' mode">;
	def fno_lto : Flag<["-"], "fno-lto">, Group<f_Group>,
	HelpText<"Disable LTO mode (default)">;
	// -flto-jobs=<n>: backend parallelism used by -flto=thin; see help text for
	// the meaning of 0.
	def flto_jobs_EQ
	  : Joined<["-"], "flto-jobs=">,
	    Group<f_Group>,
	    Flags<[CC1Option]>,
	    HelpText<"Controls the backend parallelism of -flto=thin (default of 0 means the number of threads will be derived from the number of CPUs detected)">;
	def fthinlto_index_EQ : Joined<["-"], "fthinlto-index=">,
	Flags<[CC1Option]>, Group<f_Group>,
	HelpText<"Perform ThinLTO importing using provided function summary index">;
	def fmacro_backtrace_limit_EQ : Joined<["-"], "fmacro-backtrace-limit=">,
	Group<f_Group>, Flags<[DriverOption, CoreOption]>;
	def fmerge_all_constants : Flag<["-"], "fmerge-all-constants">, Group<f_Group>;
	def fmessage_length_EQ : Joined<["-"], "fmessage-length=">, Group<f_Group>;
	def fms_extensions : Flag<["-"], "fms-extensions">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	HelpText<"Accept some non-standard constructs supported by the Microsoft compiler">;
	def fms_compatibility : Flag<["-"], "fms-compatibility">, Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	HelpText<"Enable full Microsoft Visual C++ compatibility">;
	// -fms-volatile takes no value, so it is declared as a Flag like the
	// neighbouring -fms-extensions / -fms-compatibility switches. As a Joined
	// option it would also match any argument that merely begins with
	// "-fms-volatile" and silently treat the remainder as a value.
	def fms_volatile : Flag<["-"], "fms-volatile">, Group<f_Group>, Flags<[CC1Option]>;
	def fmsc_version : Joined<["-"], "fmsc-version=">, Group<f_Group>, Flags<[DriverOption, CoreOption]>,
	HelpText<"Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))">;
	// -fms-compatibility-version=<a.b.c>: dotted Microsoft compiler version to
	// report through _MSC_VER (see help text).
	def fms_compatibility_version : Joined<["-"], "fms-compatibility-version=">,
	  Flags<[CC1Option, CoreOption]>, Group<f_Group>,
	  HelpText<"Dot-separated value representing the Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))">;
	def fdelayed_template_parsing : Flag<["-"], "fdelayed-template-parsing">, Group<f_Group>,
	HelpText<"Parse templated function definitions at the end of the "
	"translation unit">, Flags<[CC1Option, CoreOption]>;
	def fms_memptr_rep_EQ : Joined<["-"], "fms-memptr-rep=">, Group<f_Group>, Flags<[CC1Option]>;
	def fmodules_cache_path : Joined<["-"], "fmodules-cache-path=">, Group<i_Group>,
	Flags<[DriverOption, CC1Option]>, MetaVarName<"<directory>">,
	HelpText<"Specify the module cache path">;
	def fmodules_user_build_path : Separate<["-"], "fmodules-user-build-path">, Group<i_Group>,
	Flags<[DriverOption, CC1Option]>, MetaVarName<"<directory>">,
	HelpText<"Specify the module user build path">;
	def fprebuilt_module_path : Joined<["-"], "fprebuilt-module-path=">, Group<i_Group>,
	Flags<[DriverOption, CC1Option]>, MetaVarName<"<directory>">,
	HelpText<"Specify the prebuilt module path">;
	def fmodules_prune_interval : Joined<["-"], "fmodules-prune-interval=">, Group<i_Group>,
	Flags<[CC1Option]>, MetaVarName<"<seconds>">,
	HelpText<"Specify the interval (in seconds) between attempts to prune the module cache">;
	def fmodules_prune_after : Joined<["-"], "fmodules-prune-after=">, Group<i_Group>,
	Flags<[CC1Option]>, MetaVarName<"<seconds>">,
	HelpText<"Specify the interval (in seconds) after which a module file will be considered unused">;
	def fmodules_search_all : Flag <["-"], "fmodules-search-all">, Group<f_Group>,
	Flags<[DriverOption, CC1Option]>,
	HelpText<"Search even non-imported modules to resolve references">;
	def fbuild_session_timestamp : Joined<["-"], "fbuild-session-timestamp=">,
	Group<i_Group>, Flags<[CC1Option]>, MetaVarName<"<time since Epoch in seconds>">,
	HelpText<"Time when the current build session started">;
	def fbuild_session_file : Joined<["-"], "fbuild-session-file=">,
	Group<i_Group>, MetaVarName<"<file>">,
	HelpText<"Use the last modification time of <file> as the build session timestamp">;
	def fmodules_validate_once_per_build_session : Flag<["-"], "fmodules-validate-once-per-build-session">,
	Group<i_Group>, Flags<[CC1Option]>,
	HelpText<"Don't verify input files for the modules if the module has been "
	"successfully validated or loaded during this build session">;
	def fmodules_disable_diagnostic_validation : Flag<["-"], "fmodules-disable-diagnostic-validation">,
	Group<i_Group>, Flags<[CC1Option]>,
	HelpText<"Disable validation of the diagnostic options when loading the module">;
	def fmodules_validate_system_headers : Flag<["-"], "fmodules-validate-system-headers">,
	Group<i_Group>, Flags<[CC1Option]>,
	HelpText<"Validate the system headers that a module depends on when loading the module">;
	def fmodules : Flag <["-"], "fmodules">, Group<f_Group>,
	Flags<[DriverOption, CC1Option]>,
	HelpText<"Enable the 'modules' language feature">;
	def fimplicit_module_maps : Flag <["-"], "fimplicit-module-maps">, Group<f_Group>,
	Flags<[DriverOption, CC1Option]>,
	HelpText<"Implicitly search the file system for module map files.">;
	def fmodules_ts : Flag <["-"], "fmodules-ts">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Enable support for the C++ Modules TS">;
	def fmodule_maps : Flag <["-"], "fmodule-maps">, Alias<fimplicit_module_maps>;
	def fmodule_name_EQ : Joined<["-"], "fmodule-name=">, Group<f_Group>,
	Flags<[DriverOption,CC1Option]>, MetaVarName<"<name>">,
	HelpText<"Specify the name of the module to build">;
	def fmodule_name : Separate<["-"], "fmodule-name">, Alias<fmodule_name_EQ>;
	def fmodule_implementation_of : Separate<["-"], "fmodule-implementation-of">,
	Flags<[CC1Option]>, Alias<fmodule_name_EQ>;
	def fmodule_map_file : Joined<["-"], "fmodule-map-file=">,
	Group<f_Group>, Flags<[DriverOption,CC1Option]>, MetaVarName<"<file>">,
	HelpText<"Load this module map file">;
	def fmodule_file : Joined<["-"], "fmodule-file=">,
	Group<i_Group>, Flags<[DriverOption,CC1Option]>, MetaVarName<"[<name>=]<file>">,
	HelpText<"Specify the mapping of module name to precompiled module file, or load a module file if name is omitted.">;
	def fmodules_ignore_macro : Joined<["-"], "fmodules-ignore-macro=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Ignore the definition of the given macro when building and loading modules">;
	def fmodules_decluse : Flag <["-"], "fmodules-decluse">, Group<f_Group>,
	Flags<[DriverOption,CC1Option]>,
	HelpText<"Require declaration of modules used within a module">;
	def fmodules_strict_decluse : Flag <["-"], "fmodules-strict-decluse">, Group<f_Group>,
	Flags<[DriverOption,CC1Option]>,
	HelpText<"Like -fmodules-decluse but requires all headers to be in modules">;
	def fno_modules_search_all : Flag <["-"], "fno-modules-search-all">, Group<f_Group>,
	Flags<[DriverOption, CC1Option]>;
	def fno_implicit_modules :
	Flag <["-"], "fno-implicit-modules">,
	Group<f_Group>, Flags<[DriverOption, CC1Option]>;
	def fretain_comments_from_system_headers : Flag<["-"], "fretain-comments-from-system-headers">, Group<f_Group>, Flags<[CC1Option]>;

	def fmudflapth : Flag<["-"], "fmudflapth">, Group<f_Group>;
	def fmudflap : Flag<["-"], "fmudflap">, Group<f_Group>;
	def fnested_functions : Flag<["-"], "fnested-functions">, Group<f_Group>;
	def fnext_runtime : Flag<["-"], "fnext-runtime">, Group<f_Group>;
	def fno_access_control : Flag<["-"], "fno-access-control">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Disable C++ access control">;
	def fno_apple_pragma_pack : Flag<["-"], "fno-apple-pragma-pack">, Group<f_Group>;
	def fno_asm : Flag<["-"], "fno-asm">, Group<f_Group>;
	def fno_asynchronous_unwind_tables : Flag<["-"], "fno-asynchronous-unwind-tables">, Group<f_Group>;
	def fno_assume_sane_operator_new : Flag<["-"], "fno-assume-sane-operator-new">, Group<f_Group>,
	HelpText<"Don't assume that C++'s global operator new can't alias any pointer">,
	Flags<[CC1Option]>;
	def fno_blocks : Flag<["-"], "fno-blocks">, Group<f_Group>;
	def fno_borland_extensions : Flag<["-"], "fno-borland-extensions">, Group<f_Group>;
	def fno_builtin : Flag<["-"], "fno-builtin">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Disable implicit builtin knowledge of functions">;
	def fno_builtin_ : Joined<["-"], "fno-builtin-">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Disable implicit builtin knowledge of a specific function">;
	def fno_caret_diagnostics : Flag<["-"], "fno-caret-diagnostics">, Group<f_Group>,
	Flags<[CC1Option]>;
	def fno_color_diagnostics : Flag<["-"], "fno-color-diagnostics">, Group<f_Group>,
	Flags<[CoreOption, CC1Option]>;
	def fno_diagnostics_color : Flag<["-"], "fno-diagnostics-color">, Group<f_Group>,
	Flags<[CoreOption, DriverOption]>;
	def fno_common : Flag<["-"], "fno-common">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Compile common globals like normal definitions">;
	// -fno-constant-cfstrings: turns off creation of constant CFString objects.
	// The help text names the CoreFoundation framework ("CF"); the earlier
	// "CodeFoundation" wording was a typo.
	def fno_constant_cfstrings : Flag<["-"], "fno-constant-cfstrings">, Group<f_Group>,
	  Flags<[CC1Option]>,
	  HelpText<"Disable creation of CoreFoundation-type constant strings">;
	def fno_cxx_exceptions: Flag<["-"], "fno-cxx-exceptions">, Group<f_Group>;
	def fno_cxx_modules : Flag <["-"], "fno-cxx-modules">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_diagnostics_fixit_info : Flag<["-"], "fno-diagnostics-fixit-info">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Do not include fixit information in diagnostics">;
	def fno_diagnostics_show_hotness : Flag<["-"], "fno-diagnostics-show-hotness">, Group<f_Group>;
	def fno_diagnostics_show_option : Flag<["-"], "fno-diagnostics-show-option">, Group<f_Group>;
	def fno_diagnostics_show_note_include_stack : Flag<["-"], "fno-diagnostics-show-note-include-stack">,
	Flags<[CC1Option]>, Group<f_Group>;
	def fno_declspec : Flag<["-"], "fno-declspec">, Group<f_clang_Group>,
	HelpText<"Disallow __declspec as a keyword">, Flags<[CC1Option]>;
	def fno_dollars_in_identifiers : Flag<["-"], "fno-dollars-in-identifiers">, Group<f_Group>,
	HelpText<"Disallow '$' in identifiers">, Flags<[CC1Option]>;
	def fno_elide_constructors : Flag<["-"], "fno-elide-constructors">, Group<f_Group>,
	HelpText<"Disable C++ copy constructor elision">, Flags<[CC1Option]>;
	def fno_eliminate_unused_debug_symbols : Flag<["-"], "fno-eliminate-unused-debug-symbols">, Group<f_Group>;
	def fno_exceptions : Flag<["-"], "fno-exceptions">, Group<f_Group>;
	def fno_gnu_keywords : Flag<["-"], "fno-gnu-keywords">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_inline_functions : Flag<["-"], "fno-inline-functions">, Group<f_clang_Group>, Flags<[CC1Option]>;
	def fno_inline : Flag<["-"], "fno-inline">, Group<f_clang_Group>, Flags<[CC1Option]>;
	def fno_experimental_isel : Flag<["-"], "fno-experimental-isel">, Group<f_clang_Group>,
	HelpText<"Disables the experimental global instruction selector">;
	def fno_experimental_new_pass_manager : Flag<["-"], "fno-experimental-new-pass-manager">,
	Group<f_clang_Group>, Flags<[CC1Option]>,
	HelpText<"Disables an experimental new pass manager in LLVM.">;
	def fveclib : Joined<["-"], "fveclib=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use the given vector functions library">, Values<"Accelerate,SVML,none">;
	def fno_lax_vector_conversions : Flag<["-"], "fno-lax-vector-conversions">, Group<f_Group>,
	HelpText<"Disallow implicit conversions between vectors with a different number of elements or different element types">, Flags<[CC1Option]>;
	def fno_merge_all_constants : Flag<["-"], "fno-merge-all-constants">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Disallow merging of constants">;
	def fno_modules : Flag <["-"], "fno-modules">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_implicit_module_maps : Flag <["-"], "fno-implicit-module-maps">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_module_maps : Flag <["-"], "fno-module-maps">, Alias<fno_implicit_module_maps>;
	def fno_modules_decluse : Flag <["-"], "fno-modules-decluse">, Group<f_Group>,
	Flags<[DriverOption]>;
	// Negative form of -fmodules-strict-decluse. The canonical spelling now
	// mirrors the positive flag ("-fno-" + "modules-strict-decluse"); the
	// historical, transposed "-fno-strict-modules-decluse" spelling is retained
	// as an alias so existing command lines keep working.
	def fno_modules_strict_decluse : Flag <["-"], "fno-modules-strict-decluse">, Group<f_Group>,
	  Flags<[DriverOption]>;
	def : Flag <["-"], "fno-strict-modules-decluse">, Alias<fno_modules_strict_decluse>,
	  Flags<[DriverOption]>;
	def fimplicit_modules : Flag <["-"], "fimplicit-modules">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fmodule_file_deps : Flag <["-"], "fmodule-file-deps">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_module_file_deps : Flag <["-"], "fno-module-file-deps">, Group<f_Group>,
	Flags<[DriverOption]>;
	def fno_ms_extensions : Flag<["-"], "fno-ms-extensions">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fno_ms_compatibility : Flag<["-"], "fno-ms-compatibility">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fno_delayed_template_parsing : Flag<["-"], "fno-delayed-template-parsing">, Group<f_Group>,
	HelpText<"Disable delayed template parsing">,
	Flags<[DriverOption, CoreOption]>;
	// Run of mostly negative (-fno-*) language and code-generation flags, all in
	// f_Group. Records with Flags<[CC1Option]> are the ones also exposed to -cc1;
	// the rest are handled by the driver only. Two positive flags
	// (-fdiagnostics-absolute-paths, -fstruct-path-tbaa) are interleaved here.
	def fno_objc_exceptions: Flag<["-"], "fno-objc-exceptions">, Group<f_Group>;
	def fno_objc_legacy_dispatch : Flag<["-"], "fno-objc-legacy-dispatch">, Group<f_Group>;
	def fno_objc_weak : Flag<["-"], "fno-objc-weak">, Group<f_Group>, Flags<[CC1Option]>;
	def fno_omit_frame_pointer : Flag<["-"], "fno-omit-frame-pointer">, Group<f_Group>;
	def fno_operator_names : Flag<["-"], "fno-operator-names">, Group<f_Group>,
	HelpText<"Do not treat C++ operator name keywords as synonyms for operators">,
	Flags<[CC1Option]>;
	def fno_pascal_strings : Flag<["-"], "fno-pascal-strings">, Group<f_Group>;
	def fno_rtti : Flag<["-"], "fno-rtti">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Disable generation of rtti information">;
	def fno_short_enums : Flag<["-"], "fno-short-enums">, Group<f_Group>;
	// Diagnostic formatting switches.
	def fno_show_column : Flag<["-"], "fno-show-column">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Do not include column number on diagnostics">;
	def fno_show_source_location : Flag<["-"], "fno-show-source-location">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Do not include source location information with diagnostics">;
	def fdiagnostics_absolute_paths : Flag<["-"], "fdiagnostics-absolute-paths">, Group<f_Group>,
	Flags<[CC1Option, CoreOption]>, HelpText<"Print absolute paths in diagnostics">;
	def fno_spell_checking : Flag<["-"], "fno-spell-checking">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Disable spell-checking">;
	def fno_stack_protector : Flag<["-"], "fno-stack-protector">, Group<f_Group>,
	HelpText<"Disable the use of stack protectors">;
	// Aliasing / strictness switches.
	def fno_strict_aliasing : Flag<["-"], "fno-strict-aliasing">, Group<f_Group>,
	Flags<[DriverOption, CoreOption]>;
	def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group<f_Group>;
	def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group<f_Group>;
	def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>;
	def fno_strict_vtable_pointers: Flag<["-"], "fno-strict-vtable-pointers">,
	Group<f_Group>;
	def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>;
	// Static initialization / destruction mechanics.
	def fno_threadsafe_statics : Flag<["-"], "fno-threadsafe-statics">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Do not emit code to make initialization of local statics thread safe">;
	def fno_use_cxa_atexit : Flag<["-"], "fno-use-cxa-atexit">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Don't use __cxa_atexit for calling destructors">;
	def fno_use_init_array : Flag<["-"], "fno-use-init-array">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Don't use .init_array instead of .ctors">;
	def fno_unit_at_a_time : Flag<["-"], "fno-unit-at-a-time">, Group<f_Group>;
	def fno_unwind_tables : Flag<["-"], "fno-unwind-tables">, Group<f_Group>;
	def fno_verbose_asm : Flag<["-"], "fno-verbose-asm">, Group<f_Group>;
	def fno_working_directory : Flag<["-"], "fno-working-directory">, Group<f_Group>;
	def fno_wrapv : Flag<["-"], "fno-wrapv">, Group<f_Group>;
	def fno_zero_initialized_in_bss : Flag<["-"], "fno-zero-initialized-in-bss">, Group<f_Group>;
	// Objective-C ARC and exception flags. The positive forms carry CC1Option and
	// help text; the negative forms shown here are driver-only.
	def fobjc_arc : Flag<["-"], "fobjc-arc">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Synthesize retain and release calls for Objective-C pointers">;
	def fno_objc_arc : Flag<["-"], "fno-objc-arc">, Group<f_Group>;
	def fobjc_arc_exceptions : Flag<["-"], "fobjc-arc-exceptions">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use EH-safe code when synthesizing retains and releases in -fobjc-arc">;
	def fno_objc_arc_exceptions : Flag<["-"], "fno-objc-arc-exceptions">, Group<f_Group>;
	// These two are placed in clang_ignored_f_Group rather than f_Group
	// (presumably accepted for compatibility and ignored -- confirm against the
	// group's definition).
	def fobjc_atdefs : Flag<["-"], "fobjc-atdefs">, Group<clang_ignored_f_Group>;
	def fobjc_call_cxx_cdtors : Flag<["-"], "fobjc-call-cxx-cdtors">, Group<clang_ignored_f_Group>;
	def fobjc_exceptions: Flag<["-"], "fobjc-exceptions">, Group<f_Group>,
	HelpText<"Enable Objective-C exceptions">, Flags<[CC1Option]>;
	// App-extension restriction: positive form reaches -cc1, negative form is
	// driver-only.
	def fapplication_extension : Flag<["-"], "fapplication-extension">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Restrict code to those available for App Extensions">;
	def fno_application_extension : Flag<["-"], "fno-application-extension">,
	Group<f_Group>;
	// C++17 relaxed template template argument matching.
	def frelaxed_template_template_args : Flag<["-"], "frelaxed-template-template-args">,
	Flags<[CC1Option]>, HelpText<"Enable C++17 relaxed template template argument matching">,
	Group<f_Group>;
	def fno_relaxed_template_template_args : Flag<["-"], "fno-relaxed-template-template-args">,
	Group<f_Group>;
	// Sized / aligned allocation and deallocation functions.
	def fsized_deallocation : Flag<["-"], "fsized-deallocation">, Flags<[CC1Option]>,
	HelpText<"Enable C++14 sized global deallocation functions">, Group<f_Group>;
	def fno_sized_deallocation: Flag<["-"], "fno-sized-deallocation">, Group<f_Group>;
	def faligned_allocation : Flag<["-"], "faligned-allocation">, Flags<[CC1Option]>,
	HelpText<"Enable C++17 aligned allocation functions">, Group<f_Group>;
	// Unlike most negative forms in this file, -fno-aligned-allocation is itself
	// a CC1Option.
	def fno_aligned_allocation: Flag<["-"], "fno-aligned-allocation">,
	Group<f_Group>, Flags<[CC1Option]>;
	def fnew_alignment_EQ : Joined<["-"], "fnew-alignment=">,
	HelpText<"Specifies the largest alignment guaranteed by '::operator new(size_t)'">,
	MetaVarName<"<align>">, Group<f_Group>, Flags<[CC1Option]>;
	// Anonymous alias records: the separate-argument spelling of -fnew-alignment
	// and the -f[no-]aligned-new spellings map onto the records above.
	def : Separate<["-"], "fnew-alignment">, Alias<fnew_alignment_EQ>;
	def : Flag<["-"], "faligned-new">, Alias<faligned_allocation>;
	def : Flag<["-"], "fno-aligned-new">, Alias<fno_aligned_allocation>;
	// -faligned-new=<value> is declared with no group, flags or alias.
	def faligned_new_EQ : Joined<["-"], "faligned-new=">;

	// Objective-C dispatch, related-result-type inference, runtime linking and
	// weak-reference flags.
	def fobjc_legacy_dispatch : Flag<["-"], "fobjc-legacy-dispatch">, Group<f_Group>;
	def fobjc_new_property : Flag<["-"], "fobjc-new-property">, Group<clang_ignored_f_Group>;
	def fobjc_infer_related_result_type : Flag<["-"], "fobjc-infer-related-result-type">,
	Group<f_Group>;
	// Only the negative form of the inference flag is a CC1Option with help text.
	def fno_objc_infer_related_result_type : Flag<["-"],
	"fno-objc-infer-related-result-type">, Group<f_Group>,
	HelpText<
	"do not infer Objective-C related result type based on method family">,
	Flags<[CC1Option]>;
	def fobjc_link_runtime: Flag<["-"], "fobjc-link-runtime">, Group<f_Group>;
	def fobjc_weak : Flag<["-"], "fobjc-weak">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable ARC-style weak references in Objective-C">;

	// Objective-C ABI options.
	// Of these, only -fobjc-runtime= is a CC1Option; the ABI-version and
	// (non)fragile-ABI spellings are declared as driver-only f_Group options.
	def fobjc_runtime_EQ : Joined<["-"], "fobjc-runtime=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Specify the target Objective-C runtime kind and version">;
	def fobjc_abi_version_EQ : Joined<["-"], "fobjc-abi-version=">, Group<f_Group>;
	def fobjc_nonfragile_abi_version_EQ : Joined<["-"], "fobjc-nonfragile-abi-version=">, Group<f_Group>;
	def fobjc_nonfragile_abi : Flag<["-"], "fobjc-nonfragile-abi">, Group<f_Group>;
	def fno_objc_nonfragile_abi : Flag<["-"], "fno-objc-nonfragile-abi">, Group<f_Group>;

	def fobjc_sender_dependent_dispatch : Flag<["-"], "fobjc-sender-dependent-dispatch">, Group<f_Group>;
	def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group<f_Group>;
	// OpenMP options. Most carry NoArgumentUnused in addition to any CC1Option
	// marking.
	def fopenmp : Flag<["-"], "fopenmp">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
	def fno_openmp : Flag<["-"], "fno-openmp">, Group<f_Group>, Flags<[NoArgumentUnused]>;
	def fopenmp_version_EQ : Joined<["-"], "fopenmp-version=">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
	def fopenmp_EQ : Joined<["-"], "fopenmp=">, Group<f_Group>;
	def fopenmp_use_tls : Flag<["-"], "fopenmp-use-tls">, Group<f_Group>, Flags<[NoArgumentUnused]>;
	// NOTE(review): spelled "fnoopenmp-use-tls" (no dash after "fno"), unlike
	// "fno-openmp-simd" below. The spelling is user-visible, so it is left as is.
	def fnoopenmp_use_tls : Flag<["-"], "fnoopenmp-use-tls">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
	// Comma-separated list option; declared without a Group, unlike its neighbours.
	def fopenmp_targets_EQ : CommaJoined<["-"], "fopenmp-targets=">, Flags<[DriverOption, CC1Option]>,
	HelpText<"Specify comma-separated list of triples OpenMP offloading targets to be supported">;
	def fopenmp_dump_offload_linker_script : Flag<["-"], "fopenmp-dump-offload-linker-script">, Group<f_Group>,
	Flags<[NoArgumentUnused]>;
	def fopenmp_relocatable_target : Flag<["-"], "fopenmp-relocatable-target">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
	HelpText<"OpenMP target code is compiled as relocatable using the -c flag. For OpenMP targets the code is relocatable by default.">;
	// NOTE(review): same "fnoopenmp-" spelling pattern as -fnoopenmp-use-tls.
	def fnoopenmp_relocatable_target : Flag<["-"], "fnoopenmp-relocatable-target">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
	HelpText<"Do not compile OpenMP target code as relocatable.">;
	def fopenmp_simd : Flag<["-"], "fopenmp-simd">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
	HelpText<"Emit OpenMP code only for SIMD-based constructs.">;
	def fno_openmp_simd : Flag<["-"], "fno-openmp-simd">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
	HelpText<"Disable OpenMP code for SIMD-based constructs.">;
	// Sibling-call optimization toggle.
	def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
	def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>;
	// Options whose command-line spelling uses underscores; the record names
	// encode each underscore as a double underscore. They belong to no group.
	def force__cpusubtype__ALL : Flag<["-"], "force_cpusubtype_ALL">;
	def force__flat__namespace : Flag<["-"], "force_flat_namespace">;
	def force__load : Separate<["-"], "force_load">;
	// Despite the similar record name, this is the -fforce-addr flag, placed in
	// clang_ignored_f_Group.
	def force_addr : Joined<["-"], "fforce-addr">, Group<clang_ignored_f_Group>;
	def foutput_class_dir_EQ : Joined<["-"], "foutput-class-dir=">, Group<f_Group>;
	// Struct packing and type alignment limits. Only the "=<value>" forms are
	// CC1Options.
	def fpack_struct : Flag<["-"], "fpack-struct">, Group<f_Group>;
	def fno_pack_struct : Flag<["-"], "fno-pack-struct">, Group<f_Group>;
	def fpack_struct_EQ : Joined<["-"], "fpack-struct=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Specify the default maximum struct packing alignment">;
	def fmax_type_align_EQ : Joined<["-"], "fmax-type-align=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Specify the maximum alignment to enforce on pointers lacking an explicit alignment">;
	def fno_max_type_align : Flag<["-"], "fno-max-type-align">, Group<f_Group>;
	def fpascal_strings : Flag<["-"], "fpascal-strings">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Recognize and construct Pascal-style string literals">;
	def fpcc_struct_return : Flag<["-"], "fpcc-struct-return">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Override the default ABI to return all structs on the stack">;
	def fpch_preprocess : Flag<["-"], "fpch-preprocess">, Group<f_Group>;
	// Lower-case PIC/PIE toggles; all four are plain f_Group driver flags.
	def fpic : Flag<["-"], "fpic">, Group<f_Group>;
	def fno_pic : Flag<["-"], "fno-pic">, Group<f_Group>;
	def fpie : Flag<["-"], "fpie">, Group<f_Group>;
	def fno_pie : Flag<["-"], "fno-pie">, Group<f_Group>;
	// PLT usage: both polarities are CC1Options with help text.
	def fplt : Flag<["-"], "fplt">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use the PLT to make function calls">;
	def fno_plt : Flag<["-"], "fno-plt">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Do not use the PLT to make function calls">;
	// ROPI / RWPI toggles (driver-only f_Group flags).
	def fropi : Flag<["-"], "fropi">, Group<f_Group>;
	def fno_ropi : Flag<["-"], "fno-ropi">, Group<f_Group>;
	def frwpi : Flag<["-"], "frwpi">, Group<f_Group>;
	def fno_rwpi : Flag<["-"], "fno-rwpi">, Group<f_Group>;
	def fplugin_EQ : Joined<["-"], "fplugin=">, Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<dsopath>">,
	HelpText<"Load the named plugin (dynamic shared object)">;
	// Only the negative form of -fpreserve-as-comments is a CC1Option.
	def fpreserve_as_comments : Flag<["-"], "fpreserve-as-comments">, Group<f_Group>;
	def fno_preserve_as_comments : Flag<["-"], "fno-preserve-as-comments">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Do not preserve comments in inline assembly">;
	def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group<f_Group>;
	def fno_profile_arcs : Flag<["-"], "fno-profile-arcs">, Group<f_Group>;
	// -framework <name> takes a separate argument and is marked LinkerInput.
	def framework : Separate<["-"], "framework">, Flags<[LinkerInput]>;
	def frandom_seed_EQ : Joined<["-"], "frandom-seed=">, Group<clang_ignored_f_Group>;
	def freg_struct_return : Flag<["-"], "freg-struct-return">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Override the default ABI to return small structs in registers">;
	def frtti : Flag<["-"], "frtti">, Group<f_Group>;
	// Anonymous record: -fsched-interblock is placed in clang_ignored_f_Group.
	def : Flag<["-"], "fsched-interblock">, Group<clang_ignored_f_Group>;
	def fshort_enums : Flag<["-"], "fshort-enums">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Allocate to an enum type only as many bytes as it needs for the declared range of possible values">;
	// wchar_t width selection; neither polarity is a CC1Option here.
	def fshort_wchar : Flag<["-"], "fshort-wchar">, Group<f_Group>,
	HelpText<"Force wchar_t to be a short unsigned int">;
	def fno_short_wchar : Flag<["-"], "fno-short-wchar">, Group<f_Group>,
	HelpText<"Force wchar_t to be an unsigned int">;
	// -fshow-overloads=<best|all>: selects which overload candidates are listed
	// when overload resolution fails; Values<> enumerates the accepted arguments.
	// The '|' in the help text is a literal character: TableGen string literals
	// have no "\|" escape, so it must not be backslash-escaped.
	def fshow_overloads_EQ : Joined<["-"], "fshow-overloads=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Which overload candidates to show when overload resolution fails: "
	"best|all; defaults to all">, Values<"best,all">;
	// Positive forms of diagnostic-formatting and spell-checking flags.
	def fshow_column : Flag<["-"], "fshow-column">, Group<f_Group>, Flags<[CC1Option]>;
	def fshow_source_location : Flag<["-"], "fshow-source-location">, Group<f_Group>;
	def fspell_checking : Flag<["-"], "fspell-checking">, Group<f_Group>;
	def fspell_checking_limit_EQ : Joined<["-"], "fspell-checking-limit=">, Group<f_Group>;
	// Signedness of bit-fields and plain char. Only -fno-signed-char is a
	// CC1Option.
	def fsigned_bitfields : Flag<["-"], "fsigned-bitfields">, Group<f_Group>;
	def fsigned_char : Flag<["-"], "fsigned-char">, Group<f_Group>;
	def fno_signed_char : Flag<["-"], "fno-signed-char">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Char is unsigned">;
	def fsplit_stack : Flag<["-"], "fsplit-stack">, Group<f_Group>;
	// Stack protector levels (all / strong / default heuristic).
	def fstack_protector_all : Flag<["-"], "fstack-protector-all">, Group<f_Group>,
	HelpText<"Force the usage of stack protectors for all functions">;
	def fstack_protector_strong : Flag<["-"], "fstack-protector-strong">, Group<f_Group>,
	HelpText<"Use a strong heuristic to apply stack protectors to functions">;
	def fstack_protector : Flag<["-"], "fstack-protector">, Group<f_Group>,
	HelpText<"Enable stack protectors for functions potentially vulnerable to stack smashing">;
	// Debug-info completeness. -f[no-]limit-debug-info are aliases with inverted
	// polarity: -flimit-debug-info == -fno-standalone-debug and vice versa.
	def fstandalone_debug : Flag<["-"], "fstandalone-debug">, Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Emit full debug info for all types used by the program">;
	def fno_standalone_debug : Flag<["-"], "fno-standalone-debug">, Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Limit debug information produced to reduce size of debug binary">;
	def flimit_debug_info : Flag<["-"], "flimit-debug-info">, Flags<[CoreOption]>, Alias<fno_standalone_debug>;
	def fno_limit_debug_info : Flag<["-"], "fno-limit-debug-info">, Flags<[CoreOption]>, Alias<fstandalone_debug>;
	def fdebug_macro : Flag<["-"], "fdebug-macro">, Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Emit macro debug information">;
	def fno_debug_macro : Flag<["-"], "fno-debug-macro">, Group<f_Group>, Flags<[CoreOption]>,
	HelpText<"Do not emit macro debug information">;
	// Positive "strict" optimization assumptions.
	def fstrict_aliasing : Flag<["-"], "fstrict-aliasing">, Group<f_Group>,
	Flags<[DriverOption, CoreOption]>;
	def fstrict_enums : Flag<["-"], "fstrict-enums">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable optimizations based on the strict definition of an enum's "
	"value range">;
	def fstrict_vtable_pointers: Flag<["-"], "fstrict-vtable-pointers">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable optimizations based on the strict rules for overwriting "
	"polymorphic C++ objects">;
	def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group<f_Group>;
	// -fsyntax-only is in Action_Group rather than f_Group.
	def fsyntax_only : Flag<["-"], "fsyntax-only">,
	Flags<[DriverOption,CoreOption,CC1Option]>, Group<Action_Group>;
	def ftabstop_EQ : Joined<["-"], "ftabstop=">, Group<f_Group>;
	// Template instantiation limits; -ftemplate-depth accepts both the "=" and
	// the "-" joined spellings as separate records.
	def ftemplate_depth_EQ : Joined<["-"], "ftemplate-depth=">, Group<f_Group>;
	def ftemplate_depth_ : Joined<["-"], "ftemplate-depth-">, Group<f_Group>;
	def ftemplate_backtrace_limit_EQ : Joined<["-"], "ftemplate-backtrace-limit=">,
	Group<f_Group>;
	def foperator_arrow_depth_EQ : Joined<["-"], "foperator-arrow-depth=">,
	Group<f_Group>;

	// YAML optimization record output: on/off switch plus an explicit file name.
	// The negative form is marked NoArgumentUnused.
	def fsave_optimization_record : Flag<["-"], "fsave-optimization-record">,
	Group<f_Group>, HelpText<"Generate a YAML optimization record file">;
	def fno_save_optimization_record : Flag<["-"], "fno-save-optimization-record">,
	Group<f_Group>, Flags<[NoArgumentUnused]>;
	def foptimization_record_file_EQ : Joined<["-"], "foptimization-record-file=">,
	Group<f_Group>,
	HelpText<"Specify the file name of any generated YAML optimization record">;

	def ftest_coverage : Flag<["-"], "ftest-coverage">, Group<f_Group>;
	// Loop and SLP vectorizer switches. The -f[no-]tree-* spellings are anonymous
	// aliases of the corresponding -f[no-]vectorize / -f[no-]slp-vectorize records.
	def fvectorize : Flag<["-"], "fvectorize">, Group<f_Group>,
	HelpText<"Enable the loop vectorization passes">;
	def fno_vectorize : Flag<["-"], "fno-vectorize">, Group<f_Group>;
	def : Flag<["-"], "ftree-vectorize">, Alias<fvectorize>;
	def : Flag<["-"], "fno-tree-vectorize">, Alias<fno_vectorize>;
	def fslp_vectorize : Flag<["-"], "fslp-vectorize">, Group<f_Group>,
	HelpText<"Enable the superword-level parallelism vectorization passes">;
	def fno_slp_vectorize : Flag<["-"], "fno-slp-vectorize">, Group<f_Group>;
	def : Flag<["-"], "ftree-slp-vectorize">, Alias<fslp_vectorize>;
	def : Flag<["-"], "fno-tree-slp-vectorize">, Alias<fno_slp_vectorize>;
	// -Wlarge-by-value-copy: the bare flag is hidden from help (HelpHidden); the
	// "=<value>" form is the one exposed as a CC1Option.
	def Wlarge_by_value_copy_def : Flag<["-"], "Wlarge-by-value-copy">,
	HelpText<"Warn if a function definition returns or accepts an object larger "
	"in bytes than a given value">, Flags<[HelpHidden]>;
	def Wlarge_by_value_copy_EQ : Joined<["-"], "Wlarge-by-value-copy=">, Flags<[CC1Option]>;

	// These "special" warning flags are effectively processed as f_Group flags by the driver:
	// Just silence warnings about -Wlarger-than for now.
	// -Wlarger-than=<n> sits in clang_ignored_f_Group; the "-Wlarger-than-<n>"
	// spelling is an alias of it. -Wframe-larger-than= is a real f_Group
	// DriverOption.
	def Wlarger_than_EQ : Joined<["-"], "Wlarger-than=">, Group<clang_ignored_f_Group>;
	def Wlarger_than_ : Joined<["-"], "Wlarger-than-">, Alias<Wlarger_than_EQ>;
	def Wframe_larger_than_EQ : Joined<["-"], "Wframe-larger-than=">, Group<f_Group>, Flags<[DriverOption]>;

	// -fterminated-vtables is an anonymous alias of fapple_kext (declared
	// elsewhere in this file).
	def : Flag<["-"], "fterminated-vtables">, Alias<fapple_kext>;
	def fthreadsafe_statics : Flag<["-"], "fthreadsafe-statics">, Group<f_Group>;
	def ftime_report : Flag<["-"], "ftime-report">, Group<f_Group>, Flags<[CC1Option]>;
	def ftlsmodel_EQ : Joined<["-"], "ftls-model=">, Group<f_Group>, Flags<[CC1Option]>;
	// Integer-overflow trapping. Note the handler has two records: the joined
	// "-ftrapv-handler=<fn>" form carries the help text, while the separate
	// "-ftrapv-handler <fn>" form is the one marked CC1Option.
	def ftrapv : Flag<["-"], "ftrapv">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Trap on integer overflow">;
	def ftrapv_handler_EQ : Joined<["-"], "ftrapv-handler=">, Group<f_Group>,
	MetaVarName<"<function name>">,
	HelpText<"Specify the function to be called on overflow">;
	def ftrapv_handler : Separate<["-"], "ftrapv-handler">, Group<f_Group>, Flags<[CC1Option]>;
	def ftrap_function_EQ : Joined<["-"], "ftrap-function=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Issue call to specified function rather than a trap instruction">;
	def funit_at_a_time : Flag<["-"], "funit-at-a-time">, Group<f_Group>;
	// Loop unroll / reroll toggles. -fno-reroll-loops is the only one of the four
	// that is not a CC1Option.
	def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>,
	HelpText<"Turn on loop unroller">, Flags<[CC1Option]>;
	def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>,
	HelpText<"Turn off loop unroller">, Flags<[CC1Option]>;
	def freroll_loops : Flag<["-"], "freroll-loops">, Group<f_Group>,
	HelpText<"Turn on loop reroller">, Flags<[CC1Option]>;
	def fno_reroll_loops : Flag<["-"], "fno-reroll-loops">, Group<f_Group>,
	HelpText<"Turn off loop reroller">;
	def ftrigraphs : Flag<["-"], "ftrigraphs">, Group<f_Group>,
	HelpText<"Process trigraph sequences">, Flags<[CC1Option]>;
	def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group<f_Group>,
	HelpText<"Do not process trigraph sequences">, Flags<[CC1Option]>;
	def funsigned_bitfields : Flag<["-"], "funsigned-bitfields">, Group<f_Group>;
	def funsigned_char : Flag<["-"], "funsigned-char">, Group<f_Group>;
	// NOTE(review): unlike -funsigned-char, this record has no Group<f_Group>.
	def fno_unsigned_char : Flag<["-"], "fno-unsigned-char">;
	def funwind_tables : Flag<["-"], "funwind-tables">, Group<f_Group>;
	def fuse_cxa_atexit : Flag<["-"], "fuse-cxa-atexit">, Group<f_Group>;
	def fuse_init_array : Flag<["-"], "fuse-init-array">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use .init_array instead of .ctors">;
	def fno_var_tracking : Flag<["-"], "fno-var-tracking">, Group<clang_ignored_f_Group>;
	def fverbose_asm : Flag<["-"], "fverbose-asm">, Group<f_Group>;
	// Values<> lists the arguments enumerated for -fvisibility=.
	def fvisibility_EQ : Joined<["-"], "fvisibility=">, Group<f_Group>,
	HelpText<"Set the default symbol visibility for all global declarations">, Values<"hidden,default">;
	// -fvisibility-inlines-hidden: inline C++ member functions get *hidden*
	// visibility by default. The help text must agree with the option name.
	def fvisibility_inlines_hidden : Flag<["-"], "fvisibility-inlines-hidden">, Group<f_Group>,
	HelpText<"Give inline C++ member functions hidden visibility by default">,
	Flags<[CC1Option]>;
	def fvisibility_ms_compat : Flag<["-"], "fvisibility-ms-compat">, Group<f_Group>,
	HelpText<"Give global types 'default' visibility and global functions and "
	"variables 'hidden' visibility by default">;
	// Whole-program vtable optimization; per its help text it requires -flto.
	// Only the positive form is a CC1Option.
	def fwhole_program_vtables : Flag<["-"], "fwhole-program-vtables">, Group<f_Group>,
	Flags<[CoreOption, CC1Option]>,
	HelpText<"Enables whole-program vtable optimization. Requires -flto">;
	def fno_whole_program_vtables : Flag<["-"], "fno-whole-program-vtables">, Group<f_Group>,
	Flags<[CoreOption]>;
	def fwrapv : Flag<["-"], "fwrapv">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Treat signed integer overflow as two's complement">;
	def fwritable_strings : Flag<["-"], "fwritable-strings">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Store string literals as writable data">;
	def fzero_initialized_in_bss : Flag<["-"], "fzero-initialized-in-bss">, Group<f_Group>;
	// Per-function / per-datum section placement. Both polarities of each flag
	// are CC1Options; only the positive forms have help text.
	def ffunction_sections : Flag<["-"], "ffunction-sections">, Group<f_Group>,
	Flags<[CC1Option]>,
	HelpText<"Place each function in its own section (ELF Only)">;
	def fno_function_sections : Flag<["-"], "fno-function-sections">,
	Group<f_Group>, Flags<[CC1Option]>;
	def fdata_sections : Flag <["-"], "fdata-sections">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Place each data in its own section (ELF Only)">;
	def fno_data_sections : Flag <["-"], "fno-data-sections">, Group<f_Group>,
	Flags<[CC1Option]>;

	// Unique text/data section naming; both polarities are CC1Options.
	def funique_section_names : Flag <["-"], "funique-section-names">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Use unique names for text and data sections (ELF Only)">;
	def fno_unique_section_names : Flag <["-"], "fno-unique-section-names">,
	Group<f_Group>, Flags<[CC1Option]>;

	// Treatment of falling off the end of a non-void function; both polarities
	// are CC1Options.
	def fstrict_return : Flag<["-"], "fstrict-return">, Group<f_Group>,
	Flags<[CC1Option]>,
	HelpText<"Always treat control flow paths that fall off the end of a "
	"non-void function as unreachable">;
	def fno_strict_return : Flag<["-"], "fno-strict-return">, Group<f_Group>,
	Flags<[CC1Option]>;

	// Editor placeholder acceptance; only the positive form is a CC1Option.
	def fallow_editor_placeholders : Flag<["-"], "fallow-editor-placeholders">,
	Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Treat editor placeholders as valid source code">;
	def fno_allow_editor_placeholders : Flag<["-"],
	"fno-allow-editor-placeholders">, Group<f_Group>;

	// Separate section for debug types; both polarities are CC1Options.
	def fdebug_types_section: Flag <["-"], "fdebug-types-section">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Place debug types in their own section (ELF Only)">;
	def fno_debug_types_section: Flag<["-"], "fno-debug-types-section">, Group<f_Group>,
	Flags<[CC1Option]>;
	// -fsplit-dwarf-inlining: keeps minimal inlining debug info in the main
	// object when Split DWARF is in use. The help text previously duplicated
	// the unrelated -fdebug-types-section description.
	def fsplit_dwarf_inlining: Flag <["-"], "fsplit-dwarf-inlining">, Group<f_Group>,
	Flags<[CC1Option]>, HelpText<"Provide minimal debug info in the object/executable to facilitate online symbolication/stack traces in the absence of .dwo/.dwp files when using Split DWARF">;
	// Negative form of -fsplit-dwarf-inlining (also a CC1Option).
	def fno_split_dwarf_inlining: Flag<["-"], "fno-split-dwarf-inlining">, Group<f_Group>,
	Flags<[CC1Option]>;
	// -fdebug-prefix-map=<value>: joined option forwarded to -cc1.
	def fdebug_prefix_map_EQ
	: Joined<["-"], "fdebug-prefix-map=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"remap file source paths in debug info">;
	// Debug-info level selection. Plain -g is in g_Group; the numbered levels and
	// -gline-tables-only are in gN_Group. -gmlt and -g1 are both aliases of
	// -gline-tables-only.
	def g_Flag : Flag<["-"], "g">, Group<g_Group>,
	HelpText<"Generate source-level debug information">;
	def gline_tables_only : Flag<["-"], "gline-tables-only">, Group<gN_Group>,
	Flags<[CoreOption]>, HelpText<"Emit debug line number tables only">;
	def gmlt : Flag<["-"], "gmlt">, Alias<gline_tables_only>;
	def g0 : Flag<["-"], "g0">, Group<gN_Group>;
	def g1 : Flag<["-"], "g1">, Group<gN_Group>, Alias<gline_tables_only>;
	def g2 : Flag<["-"], "g2">, Group<gN_Group>;
	def g3 : Flag<["-"], "g3">, Group<gN_Group>;
	// Debugger tuning (gTune_Group) and the -ggdbN levels (ggdbN_Group).
	def ggdb : Flag<["-"], "ggdb">, Group<gTune_Group>;
	def ggdb0 : Flag<["-"], "ggdb0">, Group<ggdbN_Group>;
	def ggdb1 : Flag<["-"], "ggdb1">, Group<ggdbN_Group>;
	def ggdb2 : Flag<["-"], "ggdb2">, Group<ggdbN_Group>;
	def ggdb3 : Flag<["-"], "ggdb3">, Group<ggdbN_Group>;
	def glldb : Flag<["-"], "glldb">, Group<gTune_Group>;
	def gsce : Flag<["-"], "gsce">, Group<gTune_Group>;
	// Explicit DWARF version selection.
	def gdwarf_2 : Flag<["-"], "gdwarf-2">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 2">;
	def gdwarf_3 : Flag<["-"], "gdwarf-3">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 3">;
	def gdwarf_4 : Flag<["-"], "gdwarf-4">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 4">;
	def gdwarf_5 : Flag<["-"], "gdwarf-5">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 5">;
	// -gcodeview is declared without a Group and is visible to -cc1 and -cc1as.
	def gcodeview : Flag<["-"], "gcodeview">,
	HelpText<"Generate CodeView debug information">,
	Flags<[CC1Option, CC1AsOption, CoreOption]>;
	// Equivalent to our default dwarf version. Forces usual dwarf emission when
	// CodeView is enabled.
	def gdwarf : Flag<["-"], "gdwarf">, Alias<gdwarf_4>, Flags<[CoreOption]>;

	def gfull : Flag<["-"], "gfull">, Group<g_Group>;
	def gused : Flag<["-"], "gused">, Group<g_Group>;
	// Debug formats declared only so they can be flagged Unsupported. They are
	// Joined options, so any suffix after the prefix is captured as the value.
	def gstabs : Joined<["-"], "gstabs">, Group<g_Group>, Flags<[Unsupported]>;
	def gcoff : Joined<["-"], "gcoff">, Group<g_Group>, Flags<[Unsupported]>;
	def gxcoff : Joined<["-"], "gxcoff">, Group<g_Group>, Flags<[Unsupported]>;
	def gvms : Joined<["-"], "gvms">, Group<g_Group>, Flags<[Unsupported]>;
	def gtoggle : Flag<["-"], "gtoggle">, Group<g_flags_Group>, Flags<[Unsupported]>;
	// Debug-info modifier flags (g_flags_Group).
	def grecord_gcc_switches : Flag<["-"], "grecord-gcc-switches">, Group<g_flags_Group>;
	def gno_record_gcc_switches : Flag<["-"], "gno-record-gcc-switches">,
	Group<g_flags_Group>;
	def gstrict_dwarf : Flag<["-"], "gstrict-dwarf">, Group<g_flags_Group>;
	def gno_strict_dwarf : Flag<["-"], "gno-strict-dwarf">, Group<g_flags_Group>;
	def gcolumn_info : Flag<["-"], "gcolumn-info">, Group<g_flags_Group>, Flags<[CoreOption]>;
	def gno_column_info : Flag<["-"], "gno-column-info">, Group<g_flags_Group>, Flags<[CoreOption]>;
	def gsplit_dwarf : Flag<["-"], "gsplit-dwarf">, Group<g_flags_Group>;
	def ggnu_pubnames : Flag<["-"], "ggnu-pubnames">, Group<g_flags_Group>, Flags<[CC1Option]>;
	def gdwarf_aranges : Flag<["-"], "gdwarf-aranges">, Group<g_flags_Group>;
	// -gmodules is grouped with the -gN levels (gN_Group), not g_flags_Group.
	def gmodules : Flag <["-"], "gmodules">, Group<gN_Group>,
	HelpText<"Generate debug info with external references to clang modules"
	" or precompiled headers">;
	// Debug section compression: bare -gz and -gz=<type> share the same help text.
	def gz : Flag<["-"], "gz">, Group<g_flags_Group>,
	HelpText<"DWARF debug sections compression type">;
	def gz_EQ : Joined<["-"], "gz=">, Group<g_flags_Group>,
	HelpText<"DWARF debug sections compression type">;
	// Underscore-spelled option (record name doubles each underscore); no group.
	def headerpad__max__install__names : Joined<["-"], "headerpad_max_install_names">;
	// Accepted as both -help and --help (two prefixes).
	def help : Flag<["-", "--"], "help">, Flags<[CC1Option,CC1AsOption]>,
	HelpText<"Display available options">;
	def index_header_map : Flag<["-"], "index-header-map">, Flags<[CC1Option]>,
	HelpText<"Make the next included directory (-I or -F) an indexer header map">;
	// Include-path options (clang_i_Group). JoinedOrSeparate means the argument
	// may be attached to the option or given as the next argument.
	def idirafter : JoinedOrSeparate<["-"], "idirafter">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to AFTER include search path">;
	def iframework : JoinedOrSeparate<["-"], "iframework">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to SYSTEM framework search path">;
	def iframeworkwithsysroot : JoinedOrSeparate<["-"], "iframeworkwithsysroot">,
	Group<clang_i_Group>,
	HelpText<"Add directory to SYSTEM framework search path, "
	"absolute paths are relative to -isysroot">,
	MetaVarName<"<directory>">, Flags<[CC1Option]>;
	def imacros : JoinedOrSeparate<["-", "--"], "imacros">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Include macros from file before parsing">, MetaVarName<"<file>">;
	def image__base : Separate<["-"], "image_base">;
	// The record is named include_ (trailing underscore) and sets
	// EnumName<"include"> explicitly.
	def include_ : JoinedOrSeparate<["-", "--"], "include">, Group<clang_i_Group>, EnumName<"include">,
	MetaVarName<"<file>">, HelpText<"Include file before parsing">, Flags<[CC1Option]>;
	// Precompiled-header options.
	def include_pch : Separate<["-"], "include-pch">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Include precompiled header file">, MetaVarName<"<file>">;
	def relocatable_pch : Flag<["-", "--"], "relocatable-pch">, Flags<[CC1Option]>,
	HelpText<"Whether to build a relocatable precompiled header">;
	def verify_pch : Flag<["-"], "verify-pch">, Group<Action_Group>, Flags<[CC1Option]>,
	HelpText<"Load and verify that a pre-compiled header file is not stale">;
	def init : Separate<["-"], "init">;
	def install__name : Separate<["-"], "install_name">;
	def iprefix : JoinedOrSeparate<["-"], "iprefix">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set the -iwithprefix/-iwithprefixbefore prefix">, MetaVarName<"<dir>">;
	def iquote : JoinedOrSeparate<["-"], "iquote">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to QUOTE include search path">, MetaVarName<"<directory>">;
	def isysroot : JoinedOrSeparate<["-"], "isysroot">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set the system root directory (usually /)">, MetaVarName<"<dir>">;
	def isystem : JoinedOrSeparate<["-"], "isystem">, Group<clang_i_Group>,
	Flags<[CC1Option]>,
	HelpText<"Add directory to SYSTEM include search path">, MetaVarName<"<directory>">;
	// Unlike its neighbours, -isystem-after is a DriverOption, not a CC1Option.
	def isystem_after : JoinedOrSeparate<["-"], "isystem-after">,
	Group<clang_i_Group>, Flags<[DriverOption]>, MetaVarName<"<directory>">,
	HelpText<"Add directory to end of the SYSTEM include search path">;
	def iwithprefixbefore : JoinedOrSeparate<["-"], "iwithprefixbefore">, Group<clang_i_Group>,
	HelpText<"Set directory to include search path with prefix">, MetaVarName<"<dir>">,
	Flags<[CC1Option]>;
	def iwithprefix : JoinedOrSeparate<["-"], "iwithprefix">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set directory to SYSTEM include search path with prefix">, MetaVarName<"<dir>">;
	def iwithsysroot : JoinedOrSeparate<["-"], "iwithsysroot">, Group<clang_i_Group>,
	HelpText<"Add directory to SYSTEM include search path, "
	"absolute paths are relative to -isysroot">, MetaVarName<"<directory>">,
	Flags<[CC1Option]>;
	def ivfsoverlay : JoinedOrSeparate<["-"], "ivfsoverlay">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Overlay the virtual filesystem described by file over the real file system">;
	// Generic "-i<anything>" record in i_Group (distinct from clang_i_Group above).
	def i : Joined<["-"], "i">, Group<i_Group>;
	def keep__private__externs : Flag<["-"], "keep_private_externs">;
	// Library and framework inputs; all are marked LinkerInput. -l is
	// additionally RenderJoined and belongs to Link_Group.
	def l : JoinedOrSeparate<["-"], "l">, Flags<[LinkerInput, RenderJoined]>,
	Group<Link_Group>;
	def lazy__framework : Separate<["-"], "lazy_framework">, Flags<[LinkerInput]>;
	def lazy__library : Separate<["-"], "lazy_library">, Flags<[LinkerInput]>;
	// Endianness selection; -EL and -EB are aliases of the long spellings.
	def mlittle_endian : Flag<["-"], "mlittle-endian">, Flags<[DriverOption]>;
	def EL : Flag<["-"], "EL">, Alias<mlittle_endian>;
	def mbig_endian : Flag<["-"], "mbig-endian">, Flags<[DriverOption]>;
	def EB : Flag<["-"], "EB">, Alias<mbig_endian>;
	// Target word-size / ABI selectors (m_Group, DriverOption + CoreOption).
	def m16 : Flag<["-"], "m16">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def m32 : Flag<["-"], "m32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mqdsp6_compat : Flag<["-"], "mqdsp6-compat">, Group<m_Group>, Flags<[DriverOption,CC1Option]>,
	HelpText<"Enable hexagon-qdsp6 backward compatibility">;
	def m64 : Flag<["-"], "m64">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mx32 : Flag<["-"], "mx32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mabi_EQ : Joined<["-"], "mabi=">, Group<m_Group>;
	def miamcu : Flag<["-"], "miamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>,
	HelpText<"Use Intel MCU ABI">;
	def mno_iamcu : Flag<["-"], "mno-iamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	// Options placed in clang_ignored_m_Group.
	def malign_functions_EQ : Joined<["-"], "malign-functions=">, Group<clang_ignored_m_Group>;
	def malign_loops_EQ : Joined<["-"], "malign-loops=">, Group<clang_ignored_m_Group>;
	def malign_jumps_EQ : Joined<["-"], "malign-jumps=">, Group<clang_ignored_m_Group>;
	def mfancy_math_387 : Flag<["-"], "mfancy-math-387">, Group<clang_ignored_m_Group>;
	def mlong_calls : Flag<["-"], "mlong-calls">, Group<m_Group>,
	HelpText<"Generate branches with extended addressability, usually via indirect jumps.">;
	def mno_long_calls : Flag<["-"], "mno-long-calls">, Group<m_Group>,
	HelpText<"Restore the default behaviour of not generating long calls">;
	// ARM-only feature flags (m_arm_Features_Group).
	def mexecute_only : Flag<["-"], "mexecute-only">, Group<m_arm_Features_Group>,
	HelpText<"Disallow generation of data access to code sections (ARM only)">;
	def mno_execute_only : Flag<["-"], "mno-execute-only">, Group<m_arm_Features_Group>,
	HelpText<"Allow generation of data access to code sections (ARM only)">;
	// -mtp=<soft|cp15>: how the thread pointer is read (ARM only). Values<> is a
	// comma-separated list and must not contain spaces, otherwise the second
	// entry would be " cp15" rather than "cp15".
	def mtp_mode_EQ : Joined<["-"], "mtp=">, Group<m_arm_Features_Group>, Values<"soft,cp15">,
	HelpText<"Read thread pointer from coprocessor register (ARM only)">;
	// -m[no-]pure-code are aliases of -m[no-]execute-only.
	def mpure_code : Flag<["-"], "mpure-code">, Alias<mexecute_only>; // Alias for GCC compatibility
	def mno_pure_code : Flag<["-"], "mno-pure-code">, Alias<mno_execute_only>;
	// Minimum deployment versions for tvOS / watchOS and their simulators. The
	// "appletv*" and "watchsimulator" spellings are aliases of the canonical
	// records. The simulator records are declared without Group<m_Group>.
	def mtvos_version_min_EQ : Joined<["-"], "mtvos-version-min=">, Group<m_Group>;
	def mappletvos_version_min_EQ : Joined<["-"], "mappletvos-version-min=">, Alias<mtvos_version_min_EQ>;
	def mtvos_simulator_version_min_EQ : Joined<["-"], "mtvos-simulator-version-min=">;
	def mappletvsimulator_version_min_EQ : Joined<["-"], "mappletvsimulator-version-min=">, Alias<mtvos_simulator_version_min_EQ>;
	def mwatchos_version_min_EQ : Joined<["-"], "mwatchos-version-min=">, Group<m_Group>;
	def mwatchos_simulator_version_min_EQ : Joined<["-"], "mwatchos-simulator-version-min=">;
	def mwatchsimulator_version_min_EQ : Joined<["-"], "mwatchsimulator-version-min=">, Alias<mwatchos_simulator_version_min_EQ>;
	// Architecture / assembler dialect / code model selectors.
	def march_EQ : Joined<["-"], "march=">, Group<m_Group>;
	def masm_EQ : Joined<["-"], "masm=">, Group<m_Group>, Flags<[DriverOption]>;
	def mcmodel_EQ : Joined<["-"], "mcmodel=">, Group<m_Group>;
	def mimplicit_it_EQ : Joined<["-"], "mimplicit-it=">, Group<m_Group>;
	// NOTE(review): the following argument-less switches are declared as Joined
	// rather than Flag, so they also match when extra characters follow the
	// option name.
	def mdefault_build_attributes : Joined<["-"], "mdefault-build-attributes">, Group<m_Group>;
	def mno_default_build_attributes : Joined<["-"], "mno-default-build-attributes">, Group<m_Group>;
	def mconstant_cfstrings : Flag<["-"], "mconstant-cfstrings">, Group<clang_ignored_m_Group>;
	def mconsole : Joined<["-"], "mconsole">, Group<m_Group>, Flags<[DriverOption]>;
	def mwindows : Joined<["-"], "mwindows">, Group<m_Group>, Flags<[DriverOption]>;
	def mdll : Joined<["-"], "mdll">, Group<m_Group>, Flags<[DriverOption]>;
	def municode : Joined<["-"], "municode">, Group<m_Group>, Flags<[DriverOption]>;
	def mthreads : Joined<["-"], "mthreads">, Group<m_Group>, Flags<[DriverOption]>;
	def mcpu_EQ : Joined<["-"], "mcpu=">, Group<m_Group>;
	def mmcu_EQ : Joined<["-"], "mmcu=">, Group<m_Group>;
	def mdynamic_no_pic : Joined<["-"], "mdynamic-no-pic">, Group<m_Group>;
	// Options placed in clang_ignored_m_Group.
	def mfix_and_continue : Flag<["-"], "mfix-and-continue">, Group<clang_ignored_m_Group>;
	def mieee_fp : Flag<["-"], "mieee-fp">, Group<clang_ignored_m_Group>;
	def minline_all_stringops : Flag<["-"], "minline-all-stringops">, Group<clang_ignored_m_Group>;
	def mno_inline_all_stringops : Flag<["-"], "mno-inline-all-stringops">, Group<clang_ignored_m_Group>;
	// Records carrying CC1Option are additionally tagged for the cc1 option set.
	def malign_double : Flag<["-"], "malign-double">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Align doubles to two words in structs (x86 only)">;
	// -mfloat-abi= lists its accepted values via Values; -mfpmath=, -mfpu= and
	// -mhwdiv= declare no value list here.
	def mfloat_abi_EQ : Joined<["-"], "mfloat-abi=">, Group<m_Group>, Values<"soft,softfp,hard">;
	def mfpmath_EQ : Joined<["-"], "mfpmath=">, Group<m_Group>;
	def mfpu_EQ : Joined<["-"], "mfpu=">, Group<m_Group>;
	def mhwdiv_EQ : Joined<["-"], "mhwdiv=">, Group<m_Group>;
	def mglobal_merge : Flag<["-"], "mglobal-merge">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Enable merging of globals">;
	def mhard_float : Flag<["-"], "mhard-float">, Group<m_Group>;
	// iOS minimum-version options: -mios-version-min= aliases
	// -miphoneos-version-min=, and -miphonesimulator-version-min= aliases
	// -mios-simulator-version-min=.
	def miphoneos_version_min_EQ : Joined<["-"], "miphoneos-version-min=">, Group<m_Group>;
	def mios_version_min_EQ : Joined<["-"], "mios-version-min=">,
	Alias<miphoneos_version_min_EQ>, HelpText<"Set iOS deployment target">;
	def mios_simulator_version_min_EQ : Joined<["-"], "mios-simulator-version-min=">;
	def miphonesimulator_version_min_EQ : Joined<["-"], "miphonesimulator-version-min=">, Alias<mios_simulator_version_min_EQ>;
	def mkernel : Flag<["-"], "mkernel">, Group<m_Group>;
	def mlinker_version_EQ : Joined<["-"], "mlinker-version=">,
	Flags<[DriverOption]>;
	// -mllvm takes its argument as a separate token.
	def mllvm : Separate<["-"], "mllvm">, Flags<[CC1Option,CC1AsOption,CoreOption]>,
	HelpText<"Additional arguments to forward to LLVM's option processing">;
	// -mmacos-version-min= is an alias of -mmacosx-version-min=.
	def mmacosx_version_min_EQ : Joined<["-"], "mmacosx-version-min=">,
	Group<m_Group>, HelpText<"Set Mac OS X deployment target">;
	def mmacos_version_min_EQ : Joined<["-"], "mmacos-version-min=">,
	Group<m_Group>, Alias<mmacosx_version_min_EQ>;
	def mms_bitfields : Flag<["-"], "mms-bitfields">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the default structure layout to be compatible with the Microsoft compiler standard">;
	def mno_ms_bitfields : Flag<["-"], "mno-ms-bitfields">, Group<m_Group>,
	HelpText<"Do not set the default structure layout to be compatible with the Microsoft compiler standard">;
	// Stack layout controls.
	def mstackrealign : Flag<["-"], "mstackrealign">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Force realign the stack at entry to every function">;
	def mstack_alignment : Joined<["-"], "mstack-alignment=">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the stack alignment">;
	def mstack_probe_size : Joined<["-"], "mstack-probe-size=">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the stack probe size">;
	// Separate-argument options that enumerate their accepted values in Values.
	def mthread_model : Separate<["-"], "mthread-model">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"The thread model to use, e.g. posix, single (posix by default)">, Values<"posix,single">;
	def meabi : Separate<["-"], "meabi">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set EABI type, e.g. 4, 5 or gnu (default depends on triple)">, Values<"default,4,5,gnu">;

	// Negative (-mno-*) spellings, each declared as its own Flag record.
	// -mno-pascal-strings is an Alias of fno_pascal_strings.
	def mno_constant_cfstrings : Flag<["-"], "mno-constant-cfstrings">, Group<m_Group>;
	def mno_global_merge : Flag<["-"], "mno-global-merge">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Disable merging of globals">;
	def mno_pascal_strings : Flag<["-"], "mno-pascal-strings">,
	Alias<fno_pascal_strings>;
	def mno_red_zone : Flag<["-"], "mno-red-zone">, Group<m_Group>;
	def mno_relax_all : Flag<["-"], "mno-relax-all">, Group<m_Group>;
	def mno_rtd: Flag<["-"], "mno-rtd">, Group<m_Group>;
	def mno_soft_float : Flag<["-"], "mno-soft-float">, Group<m_Group>;
	def mno_stackrealign : Flag<["-"], "mno-stackrealign">, Group<m_Group>;

	// Options in m_arm_Features_Group.
	def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_arm_Features_Group>,
	HelpText<"Allow memory accesses to be unaligned (AArch32/AArch64 only)">;
	def mno_unaligned_access : Flag<["-"], "mno-unaligned-access">, Group<m_arm_Features_Group>,
	HelpText<"Force all memory accesses to be aligned (AArch32/AArch64 only)">;
	// -mstrict-align is a HelpHidden alias of -mno-unaligned-access.
	def mstrict_align : Flag<["-"], "mstrict-align">, Alias<mno_unaligned_access>, Flags<[CC1Option,HelpHidden]>,
	HelpText<"Force all memory accesses to be aligned (same as mno-unaligned-access)">;
	def mno_thumb : Flag<["-"], "mno-thumb">, Group<m_arm_Features_Group>;
	def mrestrict_it: Flag<["-"], "mrestrict-it">, Group<m_arm_Features_Group>,
	HelpText<"Disallow generation of deprecated IT blocks for ARMv8. It is on by default for ARMv8 Thumb mode.">;
	def mno_restrict_it: Flag<["-"], "mno-restrict-it">, Group<m_arm_Features_Group>,
	HelpText<"Allow generation of deprecated IT blocks for ARMv8. It is off by default for ARMv8 Thumb mode">;
	// -marm is an alias of -mno-thumb.
	def marm : Flag<["-"], "marm">, Alias<mno_thumb>;
	def ffixed_r9 : Flag<["-"], "ffixed-r9">, Group<m_arm_Features_Group>,
	HelpText<"Reserve the r9 register (ARM only)">;
	def mno_movt : Flag<["-"], "mno-movt">, Group<m_arm_Features_Group>,
	HelpText<"Disallow use of movt/movw pairs (ARM only)">;
	// Note the negative CRC spelling is -mnocrc (no dash after "no").
	def mcrc : Flag<["-"], "mcrc">, Group<m_arm_Features_Group>,
	HelpText<"Allow use of CRC instructions (ARM only)">;
	def mnocrc : Flag<["-"], "mnocrc">, Group<m_arm_Features_Group>,
	HelpText<"Disallow use of CRC instructions (ARM only)">;
	def mno_neg_immediates: Flag<["-"], "mno-neg-immediates">, Group<m_arm_Features_Group>,
	HelpText<"Disallow converting instructions with negative immediates to their negation or inversion.">;

	// Options in m_aarch64_Features_Group; the help texts mark them AArch64-only.
	def mgeneral_regs_only : Flag<["-"], "mgeneral-regs-only">, Group<m_aarch64_Features_Group>,
	HelpText<"Generate code which only uses the general purpose registers (AArch64 only)">;
	// Enable/disable pair for the Cortex-A53 erratum 835769 workaround.
	def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">,
	Group<m_aarch64_Features_Group>,
	HelpText<"Workaround Cortex-A53 erratum 835769 (AArch64 only)">;
	def mno_fix_cortex_a53_835769 : Flag<["-"], "mno-fix-cortex-a53-835769">,
	Group<m_aarch64_Features_Group>,
	HelpText<"Don't workaround Cortex-A53 erratum 835769 (AArch64 only)">;
	def ffixed_x18 : Flag<["-"], "ffixed-x18">, Group<m_aarch64_Features_Group>,
	HelpText<"Reserve the x18 register (AArch64 only)">;

	// Options in m_wasm_Features_Group: positive/negative pairs for simd128 and
	// nontrapping-fptoint.
	def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
	def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
	def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
	def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;

	// AMDGPU options. -mamdgpu-debugger-abi=<version> is HelpHidden and sits in
	// the generic m_Group; -m(no-)xnack are in m_amdgpu_Features_Group.
	def mamdgpu_debugger_abi : Joined<["-"], "mamdgpu-debugger-abi=">,
	Flags<[HelpHidden]>,
	Group<m_Group>,
	HelpText<"Generate additional code for specified <version> of debugger ABI (AMDGPU only)">,
	MetaVarName<"<version>">;
	def mxnack : Flag<["-"], "mxnack">, Group<m_amdgpu_Features_Group>,
	HelpText<"Enable XNACK (AMDGPU only)">;
	def mno_xnack : Flag<["-"], "mno-xnack">, Group<m_amdgpu_Features_Group>,
	HelpText<"Disable XNACK (AMDGPU only)">;

	// -f(no-)altivec are DriverOption flags in f_Group; the -m forms below are
	// in m_ppc_Features_Group.
	def faltivec : Flag<["-"], "faltivec">, Group<f_Group>, Flags<[DriverOption]>;
	def fno_altivec : Flag<["-"], "fno-altivec">, Group<f_Group>, Flags<[DriverOption]>;
	// Options in m_ppc_Features_Group, declared as positive/negative pairs.
	def maltivec : Flag<["-"], "maltivec">, Group<m_ppc_Features_Group>;
	def mno_altivec : Flag<["-"], "mno-altivec">, Group<m_ppc_Features_Group>;
	def mvsx : Flag<["-"], "mvsx">, Group<m_ppc_Features_Group>;
	def mno_vsx : Flag<["-"], "mno-vsx">, Group<m_ppc_Features_Group>;
	def mpower8_vector : Flag<["-"], "mpower8-vector">,
	Group<m_ppc_Features_Group>;
	def mno_power8_vector : Flag<["-"], "mno-power8-vector">,
	Group<m_ppc_Features_Group>;
	def mpower9_vector : Flag<["-"], "mpower9-vector">,
	Group<m_ppc_Features_Group>;
	def mno_power9_vector : Flag<["-"], "mno-power9-vector">,
	Group<m_ppc_Features_Group>;
	// The record names here differ from the spellings: mpower8_crypto is
	// -mcrypto and mnopower8_crypto is -mno-crypto.
	def mpower8_crypto : Flag<["-"], "mcrypto">,
	Group<m_ppc_Features_Group>;
	def mnopower8_crypto : Flag<["-"], "mno-crypto">,
	Group<m_ppc_Features_Group>;
	def mdirect_move : Flag<["-"], "mdirect-move">,
	Group<m_ppc_Features_Group>;
	def mnodirect_move : Flag<["-"], "mno-direct-move">,
	Group<m_ppc_Features_Group>;
	def mhtm : Flag<["-"], "mhtm">, Group<m_ppc_Features_Group>;
	def mno_htm : Flag<["-"], "mno-htm">, Group<m_ppc_Features_Group>;
	def mfprnd : Flag<["-"], "mfprnd">, Group<m_ppc_Features_Group>;
	def mno_fprnd : Flag<["-"], "mno-fprnd">, Group<m_ppc_Features_Group>;
	def mcmpb : Flag<["-"], "mcmpb">, Group<m_ppc_Features_Group>;
	def mno_cmpb : Flag<["-"], "mno-cmpb">, Group<m_ppc_Features_Group>;
	def misel : Flag<["-"], "misel">, Group<m_ppc_Features_Group>;
	def mno_isel : Flag<["-"], "mno-isel">, Group<m_ppc_Features_Group>;
	// -mmfcrf and -mno-mfcrf are aliases of -mmfocrf and -mno-mfocrf.
	def mmfocrf : Flag<["-"], "mmfocrf">, Group<m_ppc_Features_Group>;
	def mmfcrf : Flag<["-"], "mmfcrf">, Alias<mmfocrf>;
	def mno_mfocrf : Flag<["-"], "mno-mfocrf">, Group<m_ppc_Features_Group>;
	def mno_mfcrf : Flag<["-"], "mno-mfcrf">, Alias<mno_mfocrf>;
	def mpopcntd : Flag<["-"], "mpopcntd">, Group<m_ppc_Features_Group>;
	def mno_popcntd : Flag<["-"], "mno-popcntd">, Group<m_ppc_Features_Group>;
	def mqpx : Flag<["-"], "mqpx">, Group<m_ppc_Features_Group>;
	def mno_qpx : Flag<["-"], "mno-qpx">, Group<m_ppc_Features_Group>;
	def mcrbits : Flag<["-"], "mcrbits">, Group<m_ppc_Features_Group>;
	def mno_crbits : Flag<["-"], "mno-crbits">, Group<m_ppc_Features_Group>;
	def minvariant_function_descriptors :
	Flag<["-"], "minvariant-function-descriptors">, Group<m_ppc_Features_Group>;
	def mno_invariant_function_descriptors :
	Flag<["-"], "mno-invariant-function-descriptors">,
	Group<m_ppc_Features_Group>;
	def mfloat128: Flag<["-"], "mfloat128">,
	Group<m_ppc_Features_Group>;
	def mno_float128 : Flag<["-"], "mno-float128">,
	Group<m_ppc_Features_Group>;
	def mlongcall: Flag<["-"], "mlongcall">,
	Group<m_ppc_Features_Group>;
	def mno_longcall : Flag<["-"], "mno-longcall">,
	Group<m_ppc_Features_Group>;

	// -mvx / -mno-vx pair in the generic m_Group.
	// NOTE(review): presumably the SystemZ vector facility toggle, given the
	// zvector options that follow -- confirm; nothing in these records says so.
	def mvx : Flag<["-"], "mvx">, Group<m_Group>;
	def mno_vx : Flag<["-"], "mno-vx">, Group<m_Group>;

	// System z vector language extension. The -f forms are the canonical
	// CC1Option records; -mzvector / -mno-zvector are aliases of them.
	def fzvector : Flag<["-"], "fzvector">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable System z vector language extension">;
	def fno_zvector : Flag<["-"], "fno-zvector">, Group<f_Group>,
	Flags<[CC1Option]>;
	def mzvector : Flag<["-"], "mzvector">, Alias<fzvector>;
	def mno_zvector : Flag<["-"], "mno-zvector">, Alias<fno_zvector>;

	// -m(no-)backchain: both forms carry DriverOption and CC1Option; only the
	// positive form has help text.
	def mbackchain : Flag<["-"], "mbackchain">, Group<m_Group>, Flags<[DriverOption,CC1Option]>,
	HelpText<"Link stack frames through backchain on System Z">;
	def mno_backchain : Flag<["-"], "mno-backchain">, Group<m_Group>, Flags<[DriverOption,CC1Option]>;

	// Miscellaneous -m options in m_Group.
	def mno_warn_nonportable_cfstrings : Flag<["-"], "mno-warn-nonportable-cfstrings">, Group<m_Group>;
	def mno_omit_leaf_frame_pointer : Flag<["-"], "mno-omit-leaf-frame-pointer">, Group<m_Group>;
	def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">, Group<m_Group>,
	HelpText<"Omit frame pointer setup for leaf functions">, Flags<[CC1Option]>;
	def moslib_EQ : Joined<["-"], "moslib=">, Group<m_Group>;
	// -mpascal-strings is an alias of fpascal_strings.
	def mpascal_strings : Flag<["-"], "mpascal-strings">, Alias<fpascal_strings>;
	def mred_zone : Flag<["-"], "mred-zone">, Group<m_Group>;
	def mregparm_EQ : Joined<["-"], "mregparm=">, Group<m_Group>;
	// Options whose help text is prefixed "(integrated-as)". The positive
	// forms are tagged CC1Option and CC1AsOption.
	def mrelax_all : Flag<["-"], "mrelax-all">, Group<m_Group>, Flags<[CC1Option,CC1AsOption]>,
	HelpText<"(integrated-as) Relax all machine instructions">;
	def mincremental_linker_compatible : Flag<["-"], "mincremental-linker-compatible">, Group<m_Group>,
	Flags<[CC1Option,CC1AsOption]>,
	HelpText<"(integrated-as) Emit an object file which can be used with an incremental linker">;
	def mno_incremental_linker_compatible : Flag<["-"], "mno-incremental-linker-compatible">, Group<m_Group>,
	HelpText<"(integrated-as) Emit an object file which cannot be used with an incremental linker">;
	def mrtd : Flag<["-"], "mrtd">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Make StdCall calling convention the default">;
	// -msmall-data-threshold= is an alias of the G option.
	def msmall_data_threshold_EQ : Joined <["-"], "msmall-data-threshold=">,
	Group<m_Group>, Alias<G>;
	def msoft_float : Flag<["-"], "msoft-float">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Use software floating point">;
	def mno_implicit_float : Flag<["-"], "mno-implicit-float">, Group<m_Group>,
	HelpText<"Don't generate implicit floating point instructions">;
	def mimplicit_float : Flag<["-"], "mimplicit-float">, Group<m_Group>;
	// -mrecip is a bare flag; -mrecip= is CommaJoined, i.e. it takes a
	// comma-separated list of values.
	def mrecip : Flag<["-"], "mrecip">, Group<m_Group>;
	def mrecip_EQ : CommaJoined<["-"], "mrecip=">, Group<m_Group>, Flags<[CC1Option]>;
	def mprefer_vector_width_EQ : Joined<["-"], "mprefer-vector-width=">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Specifies preferred vector width for auto-vectorization. Defaults to 'none' which allows target specific decisions.">;
	def mpie_copy_relocations : Flag<["-"], "mpie-copy-relocations">, Group<m_Group>,
	Flags<[CC1Option]>,
	HelpText<"Use copy relocations support for PIE builds">;
	def mno_pie_copy_relocations : Flag<["-"], "mno-pie-copy-relocations">, Group<m_Group>;
	def mfentry : Flag<["-"], "mfentry">, HelpText<"Insert calls to fentry at function entry (x86 only)">,
	Flags<[CC1Option]>, Group<m_Group>;
	// Feature toggles declared as positive/negative pairs in m_Group. Many of
	// the help texts further down mark the options as MIPS-only; the first few
	// records carry no help text.
	def mips16 : Flag<["-"], "mips16">, Group<m_Group>;
	def mno_mips16 : Flag<["-"], "mno-mips16">, Group<m_Group>;
	def mmicromips : Flag<["-"], "mmicromips">, Group<m_Group>;
	def mno_micromips : Flag<["-"], "mno-micromips">, Group<m_Group>;
	def mxgot : Flag<["-"], "mxgot">, Group<m_Group>;
	def mno_xgot : Flag<["-"], "mno-xgot">, Group<m_Group>;
	def mldc1_sdc1 : Flag<["-"], "mldc1-sdc1">, Group<m_Group>;
	def mno_ldc1_sdc1 : Flag<["-"], "mno-ldc1-sdc1">, Group<m_Group>;
	def mcheck_zero_division : Flag<["-"], "mcheck-zero-division">, Group<m_Group>;
	def mno_check_zero_division : Flag<["-"], "mno-check-zero-division">,
	Group<m_Group>;
	def mcompact_branches_EQ : Joined<["-"], "mcompact-branches=">, Group<m_Group>;
	// -m(no-)branch-likely additionally mix in IgnoredGCCCompat (defined
	// outside this section).
	def mbranch_likely : Flag<["-"], "mbranch-likely">, Group<m_Group>,
	IgnoredGCCCompat;
	def mno_branch_likely : Flag<["-"], "mno-branch-likely">, Group<m_Group>,
	IgnoredGCCCompat;
	def mdsp : Flag<["-"], "mdsp">, Group<m_Group>;
	def mno_dsp : Flag<["-"], "mno-dsp">, Group<m_Group>;
	def mdspr2 : Flag<["-"], "mdspr2">, Group<m_Group>;
	def mno_dspr2 : Flag<["-"], "mno-dspr2">, Group<m_Group>;
	def msingle_float : Flag<["-"], "msingle-float">, Group<m_Group>;
	def mdouble_float : Flag<["-"], "mdouble-float">, Group<m_Group>;
	def mmadd4 : Flag<["-"], "mmadd4">, Group<m_Group>,
	HelpText<"Enable the generation of 4-operand madd.s, madd.d and related instructions.">;
	def mno_madd4 : Flag<["-"], "mno-madd4">, Group<m_Group>,
	HelpText<"Disable the generation of 4-operand madd.s, madd.d and related instructions.">;
	def mmsa : Flag<["-"], "mmsa">, Group<m_Group>,
	HelpText<"Enable MSA ASE (MIPS only)">;
	def mno_msa : Flag<["-"], "mno-msa">, Group<m_Group>,
	HelpText<"Disable MSA ASE (MIPS only)">;
	def mmt : Flag<["-"], "mmt">, Group<m_Group>,
	HelpText<"Enable MT ASE (MIPS only)">;
	def mno_mt : Flag<["-"], "mno-mt">, Group<m_Group>,
	HelpText<"Disable MT ASE (MIPS only)">;
	def mfp64 : Flag<["-"], "mfp64">, Group<m_Group>,
	HelpText<"Use 64-bit floating point registers (MIPS only)">;
	def mfp32 : Flag<["-"], "mfp32">, Group<m_Group>,
	HelpText<"Use 32-bit floating point registers (MIPS only)">;
	// Small-data (-G threshold) controls. Several help strings are written as
	// two adjacent string literals spanning two lines.
	def mgpopt : Flag<["-"], "mgpopt">, Group<m_Group>,
	HelpText<"Use GP relative accesses for symbols known to be in a small"
	" data section (MIPS)">;
	def mno_gpopt : Flag<["-"], "mno-gpopt">, Group<m_Group>,
	HelpText<"Do not use GP relative accesses for symbols known to be in a small"
	" data section (MIPS)">;
	def mlocal_sdata : Flag<["-"], "mlocal-sdata">, Group<m_Group>,
	HelpText<"Extend the -G behaviour to object local data (MIPS)">;
	def mno_local_sdata : Flag<["-"], "mno-local-sdata">, Group<m_Group>,
	HelpText<"Do not extend the -G behaviour to object local data (MIPS)">;
	def mextern_sdata : Flag<["-"], "mextern-sdata">, Group<m_Group>,
	HelpText<"Assume that externally defined data is in the small data if it"
	" meets the -G <size> threshold (MIPS)">;
	def mno_extern_sdata : Flag<["-"], "mno-extern-sdata">, Group<m_Group>,
	HelpText<"Do not assume that externally defined data is in the small data if"
	" it meets the -G <size> threshold (MIPS)">;
	def membedded_data : Flag<["-"], "membedded-data">, Group<m_Group>,
	HelpText<"Place constants in the .rodata section instead of the .sdata "
	"section even if they meet the -G <size> threshold (MIPS)">;
	def mno_embedded_data : Flag<["-"], "mno-embedded-data">, Group<m_Group>,
	HelpText<"Do not place constants in the .rodata section instead of the "
	".sdata if they meet the -G <size> threshold (MIPS)">;
	def mnan_EQ : Joined<["-"], "mnan=">, Group<m_Group>;
	def mabs_EQ : Joined<["-"], "mabs=">, Group<m_Group>;
	def mabicalls : Flag<["-"], "mabicalls">, Group<m_Group>,
	HelpText<"Enable SVR4-style position-independent code (Mips only)">;
	def mno_abicalls : Flag<["-"], "mno-abicalls">, Group<m_Group>,
	HelpText<"Disable SVR4-style position-independent code (Mips only)">;
	// -mips<N> shorthands. Each record is a HelpHidden Flag that aliases
	// -march= (march_EQ) and supplies the matching architecture name through
	// AliasArgs, so "-mips32r2" behaves as "-march=mips32r2".
	def mips1 : Flag<["-"], "mips1">,
	Alias<march_EQ>, AliasArgs<["mips1"]>,
	HelpText<"Equivalent to -march=mips1">, Flags<[HelpHidden]>;
	def mips2 : Flag<["-"], "mips2">,
	Alias<march_EQ>, AliasArgs<["mips2"]>,
	HelpText<"Equivalent to -march=mips2">, Flags<[HelpHidden]>;
	def mips3 : Flag<["-"], "mips3">,
	Alias<march_EQ>, AliasArgs<["mips3"]>,
	HelpText<"Equivalent to -march=mips3">, Flags<[HelpHidden]>;
	def mips4 : Flag<["-"], "mips4">,
	Alias<march_EQ>, AliasArgs<["mips4"]>,
	HelpText<"Equivalent to -march=mips4">, Flags<[HelpHidden]>;
	def mips5 : Flag<["-"], "mips5">,
	Alias<march_EQ>, AliasArgs<["mips5"]>,
	HelpText<"Equivalent to -march=mips5">, Flags<[HelpHidden]>;
	// 32-bit revisions.
	def mips32 : Flag<["-"], "mips32">,
	Alias<march_EQ>, AliasArgs<["mips32"]>,
	HelpText<"Equivalent to -march=mips32">, Flags<[HelpHidden]>;
	def mips32r2 : Flag<["-"], "mips32r2">,
	Alias<march_EQ>, AliasArgs<["mips32r2"]>,
	HelpText<"Equivalent to -march=mips32r2">, Flags<[HelpHidden]>;
	def mips32r3 : Flag<["-"], "mips32r3">,
	Alias<march_EQ>, AliasArgs<["mips32r3"]>,
	HelpText<"Equivalent to -march=mips32r3">, Flags<[HelpHidden]>;
	def mips32r5 : Flag<["-"], "mips32r5">,
	Alias<march_EQ>, AliasArgs<["mips32r5"]>,
	HelpText<"Equivalent to -march=mips32r5">, Flags<[HelpHidden]>;
	def mips32r6 : Flag<["-"], "mips32r6">,
	Alias<march_EQ>, AliasArgs<["mips32r6"]>,
	HelpText<"Equivalent to -march=mips32r6">, Flags<[HelpHidden]>;
	// 64-bit revisions.
	def mips64 : Flag<["-"], "mips64">,
	Alias<march_EQ>, AliasArgs<["mips64"]>,
	HelpText<"Equivalent to -march=mips64">, Flags<[HelpHidden]>;
	def mips64r2 : Flag<["-"], "mips64r2">,
	Alias<march_EQ>, AliasArgs<["mips64r2"]>,
	HelpText<"Equivalent to -march=mips64r2">, Flags<[HelpHidden]>;
	def mips64r3 : Flag<["-"], "mips64r3">,
	Alias<march_EQ>, AliasArgs<["mips64r3"]>,
	HelpText<"Equivalent to -march=mips64r3">, Flags<[HelpHidden]>;
	def mips64r5 : Flag<["-"], "mips64r5">,
	Alias<march_EQ>, AliasArgs<["mips64r5"]>,
	HelpText<"Equivalent to -march=mips64r5">, Flags<[HelpHidden]>;
	def mips64r6 : Flag<["-"], "mips64r6">,
	Alias<march_EQ>, AliasArgs<["mips64r6"]>,
	HelpText<"Equivalent to -march=mips64r6">, Flags<[HelpHidden]>;
	// HelpHidden floating-point register options: they have help text but are
	// flagged so that it is not listed in the normal help output.
	def mfpxx : Flag<["-"], "mfpxx">, Group<m_Group>,
	HelpText<"Avoid FPU mode dependent operations when used with the O32 ABI">,
	Flags<[HelpHidden]>;
	def modd_spreg : Flag<["-"], "modd-spreg">, Group<m_Group>,
	HelpText<"Enable odd single-precision floating point registers">,
	Flags<[HelpHidden]>;
	def mno_odd_spreg : Flag<["-"], "mno-odd-spreg">, Group<m_Group>,
	HelpText<"Disable odd single-precision floating point registers">,
	Flags<[HelpHidden]>;
	// C library selection flags, kept in their own m_libc_Group.
	def mglibc : Flag<["-"], "mglibc">, Group<m_libc_Group>, Flags<[HelpHidden]>;
	def muclibc : Flag<["-"], "muclibc">, Group<m_libc_Group>, Flags<[HelpHidden]>;
	// -module-file-info belongs to Action_Group.
	def module_file_info : Flag<["-"], "module-file-info">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
	HelpText<"Provide information about a particular module file">;
	def mthumb : Flag<["-"], "mthumb">, Group<m_Group>;
	def mtune_EQ : Joined<["-"], "mtune=">, Group<m_Group>;
	// Record names use a double underscore where the option spelling contains
	// an underscore (multi__module is -multi_module).
	def multi__module : Flag<["-"], "multi_module">;
	def multiply__defined__unused : Separate<["-"], "multiply_defined_unused">;
	def multiply__defined : Separate<["-"], "multiply_defined">;
	def mwarn_nonportable_cfstrings : Flag<["-"], "mwarn-nonportable-cfstrings">, Group<m_Group>;
	// Options spelled -no*. Records whose prefix list is ["-", "--"] accept
	// both the single- and double-dash spelling.
	def no_canonical_prefixes : Flag<["-"], "no-canonical-prefixes">, Flags<[HelpHidden]>,
	HelpText<"Use relative instead of canonical paths">;
	def no_cpp_precomp : Flag<["-"], "no-cpp-precomp">, Group<clang_ignored_f_Group>;
	def no_integrated_cpp : Flag<["-", "--"], "no-integrated-cpp">, Flags<[DriverOption]>;
	def no_pedantic : Flag<["-", "--"], "no-pedantic">, Group<pedantic_Group>;
	def no__dead__strip__inits__and__terms : Flag<["-"], "no_dead_strip_inits_and_terms">;
	def nobuiltininc : Flag<["-"], "nobuiltininc">, Flags<[CC1Option, CoreOption]>,
	HelpText<"Disable builtin #include directories">;
	def nocudainc : Flag<["-"], "nocudainc">;
	def nocudalib : Flag<["-"], "nocudalib">;
	def nodefaultlibs : Flag<["-"], "nodefaultlibs">;
	def nofixprebinding : Flag<["-"], "nofixprebinding">;
	def nolibc : Flag<["-"], "nolibc">;
	def nomultidefs : Flag<["-"], "nomultidefs">;
	// -no-pie is an alias of -nopie.
	def nopie : Flag<["-"], "nopie">;
	def no_pie : Flag<["-"], "no-pie">, Alias<nopie>;
	def noprebind : Flag<["-"], "noprebind">;
	def noseglinkedit : Flag<["-"], "noseglinkedit">;
	def nostartfiles : Flag<["-"], "nostartfiles">;
	def nostdinc : Flag<["-"], "nostdinc">, Flags<[CoreOption]>;
	def nostdlibinc : Flag<["-"], "nostdlibinc">;
	// Record names cannot contain '+', so "++" is written "xx" in the name.
	def nostdincxx : Flag<["-"], "nostdinc++">, Flags<[CC1Option]>,
	HelpText<"Disable standard #include directories for the C++ standard library">;
	def nostdlib : Flag<["-"], "nostdlib">;
	def nostdlibxx : Flag<["-"], "nostdlib++">;
	def object : Flag<["-"], "object">;
	// -o accepts its argument either joined ("-ofile") or as a separate token.
	def o : JoinedOrSeparate<["-"], "o">, Flags<[DriverOption, RenderAsInput, CC1Option, CC1AsOption]>,
	HelpText<"Write output to <file>">, MetaVarName<"<file>">;
	def pagezero__size : JoinedOrSeparate<["-"], "pagezero_size">;
	// Records flagged Unsupported are declared here but marked as not supported.
	def pass_exit_codes : Flag<["-", "--"], "pass-exit-codes">, Flags<[Unsupported]>;
	def pedantic_errors : Flag<["-", "--"], "pedantic-errors">, Group<pedantic_Group>, Flags<[CC1Option]>;
	def pedantic : Flag<["-", "--"], "pedantic">, Group<pedantic_Group>, Flags<[CC1Option]>;
	def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">, Flags<[CC1Option]>;
	def pipe : Flag<["-", "--"], "pipe">,
	HelpText<"Use pipes between commands, when possible">;
	def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">;
	def prebind : Flag<["-"], "prebind">;
	def preload : Flag<["-"], "preload">;
	// -print-* options.
	def print_file_name_EQ : Joined<["-", "--"], "print-file-name=">,
	HelpText<"Print the full library path of <file>">, MetaVarName<"<file>">;
	def print_ivar_layout : Flag<["-"], "print-ivar-layout">, Flags<[CC1Option]>,
	HelpText<"Enable Objective-C Ivar layout bitmap print trace">;
	def print_libgcc_file_name : Flag<["-", "--"], "print-libgcc-file-name">,
	HelpText<"Print the library path for the currently used compiler runtime "
	"library (\"libgcc.a\" or \"libclang_rt.builtins.*.a\")">;
	def print_multi_directory : Flag<["-", "--"], "print-multi-directory">;
	def print_multi_lib : Flag<["-", "--"], "print-multi-lib">;
	def print_multi_os_directory : Flag<["-", "--"], "print-multi-os-directory">,
	Flags<[Unsupported]>;
	def print_prog_name_EQ : Joined<["-", "--"], "print-prog-name=">,
	HelpText<"Print the full program path of <name>">, MetaVarName<"<name>">;
	def print_resource_dir : Flag<["-", "--"], "print-resource-dir">,
	HelpText<"Print the resource directory pathname">;
	def print_search_dirs : Flag<["-", "--"], "print-search-dirs">,
	HelpText<"Print the paths used for finding libraries and programs">;
	def private__bundle : Flag<["-"], "private_bundle">;
	// Threading flags: -pthread and -no-pthread are CC1Option; -pthreads is not.
	def pthreads : Flag<["-"], "pthreads">;
	def pthread : Flag<["-"], "pthread">, Flags<[CC1Option]>,
	HelpText<"Support POSIX threads in generated code">;
	def no_pthread : Flag<["-"], "no-pthread">, Flags<[CC1Option]>;
	def p : Flag<["-"], "p">;
	def pie : Flag<["-"], "pie">;
	def read__only__relocs : Separate<["-"], "read_only_relocs">;
	def remap : Flag<["-"], "remap">;
	// Objective-C rewriting. Only -rewrite-objc is in Action_Group and tagged
	// CC1Option; -rewrite-legacy-objc is DriverOption only.
	def rewrite_objc : Flag<["-"], "rewrite-objc">, Flags<[DriverOption,CC1Option]>,
	HelpText<"Rewrite Objective-C source to C++">, Group<Action_Group>;
	def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">, Flags<[DriverOption]>,
	HelpText<"Rewrite Legacy Objective-C source to C++">;
	def rdynamic : Flag<["-"], "rdynamic">;
	// -resource-dir takes a separate argument; -resource-dir=<dir> is the joined
	// alias of it.
	def resource_dir : Separate<["-"], "resource-dir">,
	Flags<[DriverOption, CC1Option, CoreOption, HelpHidden]>,
	HelpText<"The directory which holds the compiler resource files">;
	def resource_dir_EQ : Joined<["-"], "resource-dir=">, Flags<[DriverOption, CoreOption]>,
	Alias<resource_dir>;
	def rpath : Separate<["-"], "rpath">, Flags<[LinkerInput]>, Group<Link_Group>;
	def rtlib_EQ : Joined<["-", "--"], "rtlib=">,
	HelpText<"Compiler runtime library to use">;
	// -f(no-)rtlib-add-rpath carry NoArgumentUnused.
	def frtlib_add_rpath: Flag<["-"], "frtlib-add-rpath">, Flags<[NoArgumentUnused]>,
	HelpText<"Add -rpath with architecture-specific resource directory to the linker flags">;
	def fno_rtlib_add_rpath: Flag<["-"], "fno-rtlib-add-rpath">, Flags<[NoArgumentUnused]>,
	HelpText<"Do not add -rpath with architecture-specific resource directory to the linker flags">;
	def r : Flag<["-"], "r">, Flags<[LinkerInput,NoArgumentUnused]>,
	Group<Link_Group>;
	// The bare -save-temps / -save-stats flags are aliases of their "=" forms
	// with the argument fixed to "cwd" through AliasArgs.
	def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[DriverOption]>,
	HelpText<"Save intermediate compilation results.">;
	def save_temps : Flag<["-", "--"], "save-temps">, Flags<[DriverOption]>,
	Alias<save_temps_EQ>, AliasArgs<["cwd"]>,
	HelpText<"Save intermediate compilation results">;
	def save_stats_EQ : Joined<["-", "--"], "save-stats=">, Flags<[DriverOption]>,
	HelpText<"Save llvm statistics.">;
	def save_stats : Flag<["-", "--"], "save-stats">, Flags<[DriverOption]>,
	Alias<save_stats_EQ>, AliasArgs<["cwd"]>,
	HelpText<"Save llvm statistics.">;
	// Mixes in InternalDebugOpt (defined outside this section).
	def via_file_asm : Flag<["-", "--"], "via-file-asm">, InternalDebugOpt,
	HelpText<"Write assembly to file for input to assemble jobs">;
	// Section/segment options. MultiArg records consume a fixed number of
	// following arguments, given by the trailing integer (2 or 3 here).
	def sectalign : MultiArg<["-"], "sectalign", 3>;
	def sectcreate : MultiArg<["-"], "sectcreate", 3>;
	def sectobjectsymbols : MultiArg<["-"], "sectobjectsymbols", 2>;
	def sectorder : MultiArg<["-"], "sectorder", 3>;
	def seg1addr : JoinedOrSeparate<["-"], "seg1addr">;
	def seg__addr__table__filename : Separate<["-"], "seg_addr_table_filename">;
	def seg__addr__table : Separate<["-"], "seg_addr_table">;
	def segaddr : MultiArg<["-"], "segaddr", 2>;
	def segcreate : MultiArg<["-"], "segcreate", 3>;
	def seglinkedit : Flag<["-"], "seglinkedit">;
	def segprot : MultiArg<["-"], "segprot", 3>;
	def segs__read__only__addr : Separate<["-"], "segs_read_only_addr">;
	def segs__read__write__addr : Separate<["-"], "segs_read_write_addr">;
	// Joined prefix record: matches any option starting with "-segs_read_".
	def segs__read__ : Joined<["-"], "segs_read_">;
	def shared_libgcc : Flag<["-"], "shared-libgcc">;
	def shared : Flag<["-", "--"], "shared">;
	def single__module : Flag<["-"], "single_module">;
	// -specs=<value> is a plain Joined option; the separate-argument -specs
	// spelling is flagged Unsupported.
	def specs_EQ : Joined<["-", "--"], "specs=">;
	def specs : Separate<["-", "--"], "specs">, Flags<[Unsupported]>;
	def static_libgcc : Flag<["-"], "static-libgcc">;
	def static_libstdcxx : Flag<["-"], "static-libstdc++">;
	def static : Flag<["-", "--"], "static">, Flags<[NoArgumentUnused]>;
	def std_default_EQ : Joined<["-"], "std-default=">;
	// -std=<standard>: language standard to compile for (CompileOnly_Group,
	// CC1Option). Instead of a literal Values<"..."> list, ValuesCode supplies a
	// C++ fragment that builds the comma-separated Values string by including
	// LangStandards.def: each LANGSTANDARD / LANGSTANDARD_ALIAS entry expands
	// to its name (or alias) followed by ",", and adjacent string literals are
	// concatenated into one string.
	def std_EQ : Joined<["-", "--"], "std=">, Flags<[CC1Option]>,
	Group<CompileOnly_Group>, HelpText<"Language standard to compile for">,
	ValuesCode<[{
	const char *Values =
	#define LANGSTANDARD(id, name, lang, desc, features) name ","
	#define LANGSTANDARD_ALIAS(id, alias) alias ","
	#include "clang/Frontend/LangStandards.def"
	;
	}]>;
	// -stdlib=<lib> enumerates its accepted values through Values.
	def stdlib_EQ : Joined<["-", "--"], "stdlib=">, Flags<[CC1Option]>,
	HelpText<"C++ standard library to use">, Values<"libc++,libstdc++,platform">;
	def sub__library : JoinedOrSeparate<["-"], "sub_library">;
	def sub__umbrella : JoinedOrSeparate<["-"], "sub_umbrella">;
	// --(no-)system-header-prefix=<prefix>. Each joined "=" form is followed by
	// an anonymous record (def with no name) that adds the separate-argument
	// spelling as an alias of it.
	def system_header_prefix : Joined<["--"], "system-header-prefix=">,
	Group<clang_i_Group>, Flags<[CC1Option]>, MetaVarName<"<prefix>">,
	HelpText<"Treat all #include paths starting with <prefix> as including a "
	"system header.">;
	def : Separate<["--"], "system-header-prefix">, Alias<system_header_prefix>;
	def no_system_header_prefix : Joined<["--"], "no-system-header-prefix=">,
	Group<clang_i_Group>, Flags<[CC1Option]>, MetaVarName<"<prefix>">,
	HelpText<"Treat all #include paths starting with <prefix> as not including a "
	"system header.">;
	def : Separate<["--"], "no-system-header-prefix">, Alias<no_system_header_prefix>;
	// Remaining single-dash options, s through y.
	def s : Flag<["-"], "s">, Group<Link_Group>;
	// --target= and --gcc-toolchain= accept only the double-dash prefix.
	def target : Joined<["--"], "target=">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Generate code for the given target">;
	def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[DriverOption]>,
	HelpText<"Use the gcc toolchain at the given directory">;
	def time : Flag<["-"], "time">,
	HelpText<"Time individual commands">;
	def traditional_cpp : Flag<["-", "--"], "traditional-cpp">, Flags<[CC1Option]>,
	HelpText<"Enable some traditional CPP emulation">;
	def traditional : Flag<["-", "--"], "traditional">;
	// -trigraphs is an alias of ftrigraphs.
	def trigraphs : Flag<["-", "--"], "trigraphs">, Alias<ftrigraphs>,
	HelpText<"Process trigraph sequences">;
	def twolevel__namespace__hints : Flag<["-"], "twolevel_namespace_hints">;
	def twolevel__namespace : Flag<["-"], "twolevel_namespace">;
	def t : Flag<["-"], "t">, Group<Link_Group>;
	def umbrella : Separate<["-"], "umbrella">;
	// Options in u_Group.
	def undefined : JoinedOrSeparate<["-"], "undefined">, Group<u_Group>;
	def undef : Flag<["-"], "undef">, Group<u_Group>, Flags<[CC1Option]>,
	HelpText<"undef all system defines">;
	def unexported__symbols__list : Separate<["-"], "unexported_symbols_list">;
	def u : JoinedOrSeparate<["-"], "u">, Group<u_Group>;
	def v : Flag<["-"], "v">, Flags<[CC1Option, CoreOption]>,
	HelpText<"Show commands to run and use verbose output">;
	def verify_debug_info : Flag<["--"], "verify-debug-info">, Flags<[DriverOption]>,
	HelpText<"Verify the binary representation of debug output">;
	// -weak* options flagged LinkerInput.
	def weak_l : Joined<["-"], "weak-l">, Flags<[LinkerInput]>;
	def weak__framework : Separate<["-"], "weak_framework">, Flags<[LinkerInput]>;
	def weak__library : Separate<["-"], "weak_library">, Flags<[LinkerInput]>;
	def weak__reference__mismatches : Separate<["-"], "weak_reference_mismatches">;
	def whatsloaded : Flag<["-"], "whatsloaded">;
	def whyload : Flag<["-"], "whyload">;
	def w : Flag<["-"], "w">, HelpText<"Suppress all warnings">, Flags<[CC1Option]>;
	// -x <language> accepts its argument joined or as a separate token.
	def x : JoinedOrSeparate<["-"], "x">, Flags<[DriverOption,CC1Option]>,
	HelpText<"Treat subsequent input files as having type <language>">,
	MetaVarName<"<language>">;
	def y : Joined<["-"], "y">;

	def fintegrated_as : Flag<["-"], "fintegrated-as">, Flags<[DriverOption]>,
	Group<f_Group>, HelpText<"Enable the integrated assembler">;
	def fno_integrated_as : Flag<["-"], "fno-integrated-as">,
	Flags<[CC1Option, DriverOption]>, Group<f_Group>,
	HelpText<"Disable the integrated assembler">;
	def : Flag<["-"], "integrated-as">, Alias<fintegrated_as>, Flags<[DriverOption]>;
	def : Flag<["-"], "no-integrated-as">, Alias<fno_integrated_as>,
	Flags<[CC1Option, DriverOption]>;

	def working_directory : JoinedOrSeparate<["-"], "working-directory">, Flags<[CC1Option]>,
	HelpText<"Resolve file paths relative to the specified directory">;
	def working_directory_EQ : Joined<["-"], "working-directory=">, Flags<[CC1Option]>,
	Alias<working_directory>;

	// Double dash options, which are usually an alias for one of the previous
	// options.

	def _mhwdiv_EQ : Joined<["--"], "mhwdiv=">, Alias<mhwdiv_EQ>;
	def _mhwdiv : Separate<["--"], "mhwdiv">, Alias<mhwdiv_EQ>;
	def _CLASSPATH_EQ : Joined<["--"], "CLASSPATH=">, Alias<fclasspath_EQ>;
	def _CLASSPATH : Separate<["--"], "CLASSPATH">, Alias<fclasspath_EQ>;
	def _all_warnings : Flag<["--"], "all-warnings">, Alias<Wall>;
	def _analyze_auto : Flag<["--"], "analyze-auto">, Flags<[DriverOption]>;
	def _analyzer_no_default_checks : Flag<["--"], "analyzer-no-default-checks">, Flags<[DriverOption]>;
	def _analyzer_output : JoinedOrSeparate<["--"], "analyzer-output">, Flags<[DriverOption]>,
	HelpText<"Static analyzer report output format (html\|plist\|plist-multi-file\|plist-html\|text).">;
	def _analyze : Flag<["--"], "analyze">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Run the static analyzer">;
	def _assemble : Flag<["--"], "assemble">, Alias<S>;
	def _assert_EQ : Joined<["--"], "assert=">, Alias<A>;
	def _assert : Separate<["--"], "assert">, Alias<A>;
	def _bootclasspath_EQ : Joined<["--"], "bootclasspath=">, Alias<fbootclasspath_EQ>;
	def _bootclasspath : Separate<["--"], "bootclasspath">, Alias<fbootclasspath_EQ>;
	def _classpath_EQ : Joined<["--"], "classpath=">, Alias<fclasspath_EQ>;
	def _classpath : Separate<["--"], "classpath">, Alias<fclasspath_EQ>;
	def _comments_in_macros : Flag<["--"], "comments-in-macros">, Alias<CC>;
	def _comments : Flag<["--"], "comments">, Alias<C>;
	def _compile : Flag<["--"], "compile">, Alias<c>;
	def _constant_cfstrings : Flag<["--"], "constant-cfstrings">;
	def _debug_EQ : Joined<["--"], "debug=">, Alias<g_Flag>;
	def _debug : Flag<["--"], "debug">, Alias<g_Flag>;
	def _define_macro_EQ : Joined<["--"], "define-macro=">, Alias<D>;
	def _define_macro : Separate<["--"], "define-macro">, Alias<D>;
	def _dependencies : Flag<["--"], "dependencies">, Alias<M>;
	def _dyld_prefix_EQ : Joined<["--"], "dyld-prefix=">;
	def _dyld_prefix : Separate<["--"], "dyld-prefix">, Alias<_dyld_prefix_EQ>;
	def _encoding_EQ : Joined<["--"], "encoding=">, Alias<fencoding_EQ>;
	def _encoding : Separate<["--"], "encoding">, Alias<fencoding_EQ>;
	def _entry : Flag<["--"], "entry">, Alias<e>;
	def _extdirs_EQ : Joined<["--"], "extdirs=">, Alias<fextdirs_EQ>;
	def _extdirs : Separate<["--"], "extdirs">, Alias<fextdirs_EQ>;
	def _extra_warnings : Flag<["--"], "extra-warnings">, Alias<W_Joined>;
	def _for_linker_EQ : Joined<["--"], "for-linker=">, Alias<Xlinker>;
	def _for_linker : Separate<["--"], "for-linker">, Alias<Xlinker>;
	def _force_link_EQ : Joined<["--"], "force-link=">, Alias<u>;
	def _force_link : Separate<["--"], "force-link">, Alias<u>;
	def _help_hidden : Flag<["--"], "help-hidden">;
	def _imacros_EQ : Joined<["--"], "imacros=">, Alias<imacros>;
	def _include_barrier : Flag<["--"], "include-barrier">, Alias<I_>;
	def _include_directory_after_EQ : Joined<["--"], "include-directory-after=">, Alias<idirafter>;
	def _include_directory_after : Separate<["--"], "include-directory-after">, Alias<idirafter>;
	def _include_directory_EQ : Joined<["--"], "include-directory=">, Alias<I>;
	def _include_directory : Separate<["--"], "include-directory">, Alias<I>;
	def _include_prefix_EQ : Joined<["--"], "include-prefix=">, Alias<iprefix>;
	def _include_prefix : Separate<["--"], "include-prefix">, Alias<iprefix>;
	def _include_with_prefix_after_EQ : Joined<["--"], "include-with-prefix-after=">, Alias<iwithprefix>;
	def _include_with_prefix_after : Separate<["--"], "include-with-prefix-after">, Alias<iwithprefix>;
	def _include_with_prefix_before_EQ : Joined<["--"], "include-with-prefix-before=">, Alias<iwithprefixbefore>;
	def _include_with_prefix_before : Separate<["--"], "include-with-prefix-before">, Alias<iwithprefixbefore>;
	def _include_with_prefix_EQ : Joined<["--"], "include-with-prefix=">, Alias<iwithprefix>;
	def _include_with_prefix : Separate<["--"], "include-with-prefix">, Alias<iwithprefix>;
	def _include_EQ : Joined<["--"], "include=">, Alias<include_>;
	def _language_EQ : Joined<["--"], "language=">, Alias<x>;
	def _language : Separate<["--"], "language">, Alias<x>;
	def _library_directory_EQ : Joined<["--"], "library-directory=">, Alias<L>;
	def _library_directory : Separate<["--"], "library-directory">, Alias<L>;
	def _no_line_commands : Flag<["--"], "no-line-commands">, Alias<P>;
	def _no_standard_includes : Flag<["--"], "no-standard-includes">, Alias<nostdinc>;
	def _no_standard_libraries : Flag<["--"], "no-standard-libraries">, Alias<nostdlib>;
	def _no_undefined : Flag<["--"], "no-undefined">, Flags<[LinkerInput]>;
	def _no_warnings : Flag<["--"], "no-warnings">, Alias<w>;
	def _optimize_EQ : Joined<["--"], "optimize=">, Alias<O>;
	def _optimize : Flag<["--"], "optimize">, Alias<O>;
	def _output_class_directory_EQ : Joined<["--"], "output-class-directory=">, Alias<foutput_class_dir_EQ>;
	def _output_class_directory : Separate<["--"], "output-class-directory">, Alias<foutput_class_dir_EQ>;
	def _output_EQ : Joined<["--"], "output=">, Alias<o>;
	def _output : Separate<["--"], "output">, Alias<o>;
	def _param : Separate<["--"], "param">, Group<CompileOnly_Group>;
	def _param_EQ : Joined<["--"], "param=">, Alias<_param>;
	def _precompile : Flag<["--"], "precompile">, Flags<[DriverOption]>,
	Group<Action_Group>, HelpText<"Only precompile the input">;
	def _prefix_EQ : Joined<["--"], "prefix=">, Alias<B>;
	def _prefix : Separate<["--"], "prefix">, Alias<B>;
	def _preprocess : Flag<["--"], "preprocess">, Alias<E>;
	def _print_diagnostic_categories : Flag<["--"], "print-diagnostic-categories">;
	def _print_file_name : Separate<["--"], "print-file-name">, Alias<print_file_name_EQ>;
	def _print_missing_file_dependencies : Flag<["--"], "print-missing-file-dependencies">, Alias<MG>;
	def _print_prog_name : Separate<["--"], "print-prog-name">, Alias<print_prog_name_EQ>;
	def _profile_blocks : Flag<["--"], "profile-blocks">, Alias<a>;
	def _profile : Flag<["--"], "profile">, Alias<p>;
	def _resource_EQ : Joined<["--"], "resource=">, Alias<fcompile_resource_EQ>;
	def _resource : Separate<["--"], "resource">, Alias<fcompile_resource_EQ>;
	def _rtlib : Separate<["--"], "rtlib">, Alias<rtlib_EQ>;
	def _serialize_diags : Separate<["-", "--"], "serialize-diagnostics">, Flags<[DriverOption]>,
	HelpText<"Serialize compiler diagnostics to a file">;
	// We give --version different semantics from -version.
	def _version : Flag<["--"], "version">, Flags<[CoreOption, CC1Option]>,
	HelpText<"Print version information">;
	def _signed_char : Flag<["--"], "signed-char">, Alias<fsigned_char>;
	def _std : Separate<["--"], "std">, Alias<std_EQ>;
	def _stdlib : Separate<["--"], "stdlib">, Alias<stdlib_EQ>;
	def _sysroot_EQ : Joined<["--"], "sysroot=">;
	def _sysroot : Separate<["--"], "sysroot">, Alias<_sysroot_EQ>;
	def _target_help : Flag<["--"], "target-help">;
	def _trace_includes : Flag<["--"], "trace-includes">, Alias<H>;
	def _undefine_macro_EQ : Joined<["--"], "undefine-macro=">, Alias<U>;
	def _undefine_macro : Separate<["--"], "undefine-macro">, Alias<U>;
	def _unsigned_char : Flag<["--"], "unsigned-char">, Alias<funsigned_char>;
	def _user_dependencies : Flag<["--"], "user-dependencies">, Alias<MM>;
	def _verbose : Flag<["--"], "verbose">, Alias<v>;
	def _warn__EQ : Joined<["--"], "warn-=">, Alias<W_Joined>;
	def _warn_ : Joined<["--"], "warn-">, Alias<W_Joined>;
	def _write_dependencies : Flag<["--"], "write-dependencies">, Alias<MD>;
	def _write_user_dependencies : Flag<["--"], "write-user-dependencies">, Alias<MMD>;
	// Joined option with an empty name under the "--" prefix, flagged Unsupported.
	// NOTE(review): presumably this is the catch-all for "--" arguments that match
	// none of the named options above -- confirm against the option parser.
	def _ : Joined<["--"], "">, Flags<[Unsupported]>;

	// Hexagon feature flags
	def mieee_rnd_near : Flag<["-"], "mieee-rnd-near">, Group<m_hexagon_Features_Group>;
	// -mv4 ... -mv65 are shorthands: each one is declared as an alias of mcpu_EQ
	// with the matching "hexagonvN" value supplied through AliasArgs.
	def mv4 : Flag<["-"], "mv4">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv4"]>;
	def mv5 : Flag<["-"], "mv5">, Group<m_hexagon_Features_Group>, Alias<mcpu_EQ>,
	AliasArgs<["hexagonv5"]>;
	def mv55 : Flag<["-"], "mv55">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv55"]>;
	def mv60 : Flag<["-"], "mv60">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv60"]>;
	def mv62 : Flag<["-"], "mv62">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv62"]>;
	def mv65 : Flag<["-"], "mv65">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv65"]>;
	def mhexagon_hvx : Flag<[ "-" ], "mhvx">,
	Group<m_hexagon_Features_HVX_Group>,
	HelpText<"Enable Hexagon Vector eXtensions">;
	def mhexagon_hvx_EQ : Joined<[ "-" ], "mhvx=">,
	Group<m_hexagon_Features_HVX_Group>,
	HelpText<"Enable Hexagon Vector eXtensions">;
	def mno_hexagon_hvx : Flag<[ "-" ], "mno-hvx">,
	Group<m_hexagon_Features_HVX_Group>,
	HelpText<"Disable Hexagon Vector eXtensions">;
	def mhexagon_hvx_length_EQ : Joined<[ "-" ], "mhvx-length=">,
	Group<m_hexagon_Features_HVX_Group>,
	HelpText<"Set Hexagon Vector Length">, Values<"64B,128B">;
	// hvx-double deprecated flag.
	def mhexagon_hvx_double : Flag<[ "-" ], "mhvx-double">,
	Group<m_hexagon_Features_HVX_Group>,
	HelpText<"Enable Hexagon Double Vector eXtensions">;
	def mno_hexagon_hvx_double
	: Flag<[ "-" ], "mno-hvx-double">,
	Group<m_hexagon_Features_HVX_Group>,
	HelpText<"Disable Hexagon Double Vector eXtensions">;


	// X86 feature flags
	def mx87 : Flag<["-"], "mx87">, Group<m_x86_Features_Group>;
	def mno_x87 : Flag<["-"], "mno-x87">, Group<m_x86_Features_Group>;
	def m80387 : Flag<["-"], "m80387">, Alias<mx87>;
	def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
	def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>;
	def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>;
	def m3dnow : Flag<["-"], "m3dnow">, Group<m_x86_Features_Group>;
	def mno_3dnow : Flag<["-"], "mno-3dnow">, Group<m_x86_Features_Group>;
	def m3dnowa : Flag<["-"], "m3dnowa">, Group<m_x86_Features_Group>;
	def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group<m_x86_Features_Group>;
	def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
	def mno_sse : Flag<["-"], "mno-sse">, Group<m_x86_Features_Group>;
	def msse2 : Flag<["-"], "msse2">, Group<m_x86_Features_Group>;
	def mno_sse2 : Flag<["-"], "mno-sse2">, Group<m_x86_Features_Group>;
	def msse3 : Flag<["-"], "msse3">, Group<m_x86_Features_Group>;
	def mno_sse3 : Flag<["-"], "mno-sse3">, Group<m_x86_Features_Group>;
	def mssse3 : Flag<["-"], "mssse3">, Group<m_x86_Features_Group>;
	def mno_ssse3 : Flag<["-"], "mno-ssse3">, Group<m_x86_Features_Group>;
	def msse4_1 : Flag<["-"], "msse4.1">, Group<m_x86_Features_Group>;
	def mno_sse4_1 : Flag<["-"], "mno-sse4.1">, Group<m_x86_Features_Group>;
	def msse4_2 : Flag<["-"], "msse4.2">, Group<m_x86_Features_Group>;
	def mno_sse4_2 : Flag<["-"], "mno-sse4.2">, Group<m_x86_Features_Group>;
	def msse4 : Flag<["-"], "msse4">, Alias<msse4_2>;
	// -mno-sse4 turns off sse4.1 which has the effect of turning off everything
	// later than 4.1. -msse4 turns on 4.2 which has the effect of turning on
	// everything earlier than 4.2.
	def mno_sse4 : Flag<["-"], "mno-sse4">, Alias<mno_sse4_1>;
	def msse4a : Flag<["-"], "msse4a">, Group<m_x86_Features_Group>;
	def mno_sse4a : Flag<["-"], "mno-sse4a">, Group<m_x86_Features_Group>;
	def mavx : Flag<["-"], "mavx">, Group<m_x86_Features_Group>;
	def mno_avx : Flag<["-"], "mno-avx">, Group<m_x86_Features_Group>;
	def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
	def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
	def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
	def mno_avx512f : Flag<["-"], "mno-avx512f">, Group<m_x86_Features_Group>;
	def mavx512bitalg : Flag<["-"], "mavx512bitalg">, Group<m_x86_Features_Group>;
	def mno_avx512bitalg : Flag<["-"], "mno-avx512bitalg">, Group<m_x86_Features_Group>;
	def mavx512bw : Flag<["-"], "mavx512bw">, Group<m_x86_Features_Group>;
	def mno_avx512bw : Flag<["-"], "mno-avx512bw">, Group<m_x86_Features_Group>;
	def mavx512cd : Flag<["-"], "mavx512cd">, Group<m_x86_Features_Group>;
	def mno_avx512cd : Flag<["-"], "mno-avx512cd">, Group<m_x86_Features_Group>;
	def mavx512dq : Flag<["-"], "mavx512dq">, Group<m_x86_Features_Group>;
	def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group<m_x86_Features_Group>;
	def mavx512er : Flag<["-"], "mavx512er">, Group<m_x86_Features_Group>;
	def mno_avx512er : Flag<["-"], "mno-avx512er">, Group<m_x86_Features_Group>;
	def mavx512ifma : Flag<["-"], "mavx512ifma">, Group<m_x86_Features_Group>;
	def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group<m_x86_Features_Group>;
	def mavx512pf : Flag<["-"], "mavx512pf">, Group<m_x86_Features_Group>;
	def mno_avx512pf : Flag<["-"], "mno-avx512pf">, Group<m_x86_Features_Group>;
	def mavx512vbmi : Flag<["-"], "mavx512vbmi">, Group<m_x86_Features_Group>;
	def mno_avx512vbmi : Flag<["-"], "mno-avx512vbmi">, Group<m_x86_Features_Group>;
	def mavx512vbmi2 : Flag<["-"], "mavx512vbmi2">, Group<m_x86_Features_Group>;
	def mno_avx512vbmi2 : Flag<["-"], "mno-avx512vbmi2">, Group<m_x86_Features_Group>;
	def mavx512vl : Flag<["-"], "mavx512vl">, Group<m_x86_Features_Group>;
	def mno_avx512vl : Flag<["-"], "mno-avx512vl">, Group<m_x86_Features_Group>;
	def mavx512vnni : Flag<["-"], "mavx512vnni">, Group<m_x86_Features_Group>;
	def mno_avx512vnni : Flag<["-"], "mno-avx512vnni">, Group<m_x86_Features_Group>;
	def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Group>;
	def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
	def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
	def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
	def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;
	def mno_aes : Flag<["-"], "mno-aes">, Group<m_x86_Features_Group>;
	def mbmi : Flag<["-"], "mbmi">, Group<m_x86_Features_Group>;
	def mno_bmi : Flag<["-"], "mno-bmi">, Group<m_x86_Features_Group>;
	def mbmi2 : Flag<["-"], "mbmi2">, Group<m_x86_Features_Group>;
	def mno_bmi2 : Flag<["-"], "mno-bmi2">, Group<m_x86_Features_Group>;
	def mclflushopt : Flag<["-"], "mclflushopt">, Group<m_x86_Features_Group>;
	def mno_clflushopt : Flag<["-"], "mno-clflushopt">, Group<m_x86_Features_Group>;
	def mclwb : Flag<["-"], "mclwb">, Group<m_x86_Features_Group>;
	def mno_clwb : Flag<["-"], "mno-clwb">, Group<m_x86_Features_Group>;
	def mclzero : Flag<["-"], "mclzero">, Group<m_x86_Features_Group>;
	def mno_clzero : Flag<["-"], "mno-clzero">, Group<m_x86_Features_Group>;
	def mcx16 : Flag<["-"], "mcx16">, Group<m_x86_Features_Group>;
	def mno_cx16 : Flag<["-"], "mno-cx16">, Group<m_x86_Features_Group>;
	def mf16c : Flag<["-"], "mf16c">, Group<m_x86_Features_Group>;
	def mno_f16c : Flag<["-"], "mno-f16c">, Group<m_x86_Features_Group>;
	def mfma : Flag<["-"], "mfma">, Group<m_x86_Features_Group>;
	def mno_fma : Flag<["-"], "mno-fma">, Group<m_x86_Features_Group>;
	def mfma4 : Flag<["-"], "mfma4">, Group<m_x86_Features_Group>;
	def mno_fma4 : Flag<["-"], "mno-fma4">, Group<m_x86_Features_Group>;
	def mfsgsbase : Flag<["-"], "mfsgsbase">, Group<m_x86_Features_Group>;
	def mno_fsgsbase : Flag<["-"], "mno-fsgsbase">, Group<m_x86_Features_Group>;
	def mfxsr : Flag<["-"], "mfxsr">, Group<m_x86_Features_Group>;
	def mno_fxsr : Flag<["-"], "mno-fxsr">, Group<m_x86_Features_Group>;
	def mgfni : Flag<["-"], "mgfni">, Group<m_x86_Features_Group>;
	def mno_gfni : Flag<["-"], "mno-gfni">, Group<m_x86_Features_Group>;
	def mlwp : Flag<["-"], "mlwp">, Group<m_x86_Features_Group>;
	def mno_lwp : Flag<["-"], "mno-lwp">, Group<m_x86_Features_Group>;
	def mlzcnt : Flag<["-"], "mlzcnt">, Group<m_x86_Features_Group>;
	def mno_lzcnt : Flag<["-"], "mno-lzcnt">, Group<m_x86_Features_Group>;
	def mmovbe : Flag<["-"], "mmovbe">, Group<m_x86_Features_Group>;
	def mno_movbe : Flag<["-"], "mno-movbe">, Group<m_x86_Features_Group>;
	def mmpx : Flag<["-"], "mmpx">, Group<m_x86_Features_Group>;
	def mno_mpx : Flag<["-"], "mno-mpx">, Group<m_x86_Features_Group>;
	def mmwaitx : Flag<["-"], "mmwaitx">, Group<m_x86_Features_Group>;
	def mno_mwaitx : Flag<["-"], "mno-mwaitx">, Group<m_x86_Features_Group>;
	def mpku : Flag<["-"], "mpku">, Group<m_x86_Features_Group>;
	def mno_pku : Flag<["-"], "mno-pku">, Group<m_x86_Features_Group>;
	def mpclmul : Flag<["-"], "mpclmul">, Group<m_x86_Features_Group>;
	def mno_pclmul : Flag<["-"], "mno-pclmul">, Group<m_x86_Features_Group>;
	def mpopcnt : Flag<["-"], "mpopcnt">, Group<m_x86_Features_Group>;
	def mno_popcnt : Flag<["-"], "mno-popcnt">, Group<m_x86_Features_Group>;
	def mprefetchwt1 : Flag<["-"], "mprefetchwt1">, Group<m_x86_Features_Group>;
	def mno_prefetchwt1 : Flag<["-"], "mno-prefetchwt1">, Group<m_x86_Features_Group>;
	def mprfchw : Flag<["-"], "mprfchw">, Group<m_x86_Features_Group>;
	def mno_prfchw : Flag<["-"], "mno-prfchw">, Group<m_x86_Features_Group>;
	def mrdrnd : Flag<["-"], "mrdrnd">, Group<m_x86_Features_Group>;
	def mno_rdrnd : Flag<["-"], "mno-rdrnd">, Group<m_x86_Features_Group>;
	def mrtm : Flag<["-"], "mrtm">, Group<m_x86_Features_Group>;
	def mno_rtm : Flag<["-"], "mno-rtm">, Group<m_x86_Features_Group>;
	def mrdseed : Flag<["-"], "mrdseed">, Group<m_x86_Features_Group>;
	def mno_rdseed : Flag<["-"], "mno-rdseed">, Group<m_x86_Features_Group>;
	def msgx : Flag<["-"], "msgx">, Group<m_x86_Features_Group>;
	def mno_sgx : Flag<["-"], "mno-sgx">, Group<m_x86_Features_Group>;
	def msha : Flag<["-"], "msha">, Group<m_x86_Features_Group>;
	def mno_sha : Flag<["-"], "mno-sha">, Group<m_x86_Features_Group>;
	def mtbm : Flag<["-"], "mtbm">, Group<m_x86_Features_Group>;
	def mno_tbm : Flag<["-"], "mno-tbm">, Group<m_x86_Features_Group>;
	def mvaes : Flag<["-"], "mvaes">, Group<m_x86_Features_Group>;
	def mno_vaes : Flag<["-"], "mno-vaes">, Group<m_x86_Features_Group>;
	def mvpclmulqdq : Flag<["-"], "mvpclmulqdq">, Group<m_x86_Features_Group>;
	def mno_vpclmulqdq : Flag<["-"], "mno-vpclmulqdq">, Group<m_x86_Features_Group>;
	def mxop : Flag<["-"], "mxop">, Group<m_x86_Features_Group>;
	def mno_xop : Flag<["-"], "mno-xop">, Group<m_x86_Features_Group>;
	def mxsave : Flag<["-"], "mxsave">, Group<m_x86_Features_Group>;
	def mno_xsave : Flag<["-"], "mno-xsave">, Group<m_x86_Features_Group>;
	def mxsavec : Flag<["-"], "mxsavec">, Group<m_x86_Features_Group>;
	def mno_xsavec : Flag<["-"], "mno-xsavec">, Group<m_x86_Features_Group>;
	def mxsaveopt : Flag<["-"], "mxsaveopt">, Group<m_x86_Features_Group>;
	def mno_xsaveopt : Flag<["-"], "mno-xsaveopt">, Group<m_x86_Features_Group>;
	def mxsaves : Flag<["-"], "mxsaves">, Group<m_x86_Features_Group>;
	def mno_xsaves : Flag<["-"], "mno-xsaves">, Group<m_x86_Features_Group>;
	def mshstk : Flag<["-"], "mshstk">, Group<m_x86_Features_Group>;
	def mno_shstk : Flag<["-"], "mno-shstk">, Group<m_x86_Features_Group>;
	def mibt : Flag<["-"], "mibt">, Group<m_x86_Features_Group>;
	def mno_ibt : Flag<["-"], "mno-ibt">, Group<m_x86_Features_Group>;
	+def mretpoline : Flag<["-"], "mretpoline">, Group<m_x86_Features_Group>;
	+def mno_retpoline : Flag<["-"], "mno-retpoline">, Group<m_x86_Features_Group>;
	+def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group<m_x86_Features_Group>;
	+def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group<m_x86_Features_Group>;

	// These are legacy user-facing driver-level option spellings. They are always
	// aliases for options that are spelled using the more common Unix / GNU flag
	// style of double-dash and equals-joined flags.
	def gcc_toolchain_legacy_spelling : Separate<["-"], "gcc-toolchain">, Alias<gcc_toolchain>;
	def target_legacy_spelling : Separate<["-"], "target">, Alias<target>;

	// Special internal option to handle -Xlinker --no-demangle.
	def Z_Xlinker__no_demangle : Flag<["-"], "Z-Xlinker-no-demangle">,
	Flags<[Unsupported, NoArgumentUnused]>;

	// Special internal option to allow forwarding arbitrary arguments to linker.
	def Zlinker_input : Separate<["-"], "Zlinker-input">,
	Flags<[Unsupported, NoArgumentUnused]>;

	// Reserved library options.
	def Z_reserved_lib_stdcxx : Flag<["-"], "Z-reserved-lib-stdc++">,
	Flags<[LinkerInput, NoArgumentUnused, Unsupported]>, Group<reserved_lib_Group>;
	def Z_reserved_lib_cckext : Flag<["-"], "Z-reserved-lib-cckext">,
	Flags<[LinkerInput, NoArgumentUnused, Unsupported]>, Group<reserved_lib_Group>;

	// Ignored options
	// Helper for boolean "-f" options: each `defm X : BooleanFFlag<"name">`
	// instantiates two Flag records, one spelled "-f<name>" and one spelled
	// "-fno-<name>".
	// FIXME: multiclasses produce suffixes, not prefixes (the records declared
	// here are named `_f` and `_fno`). This is fine for now since it is only used
	// in ignored options.
	multiclass BooleanFFlag<string name> {
	def _f : Flag<["-"], "f"#name>;
	def _fno : Flag<["-"], "fno-"#name>;
	}

	defm : BooleanFFlag<"keep-inline-functions">, Group<clang_ignored_gcc_optimization_f_Group>;

	def fprofile_dir : Joined<["-"], "fprofile-dir=">, Group<f_Group>;

	def fuse_ld_EQ : Joined<["-"], "fuse-ld=">, Group<f_Group>, Flags<[CoreOption]>;

	defm align_functions : BooleanFFlag<"align-functions">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_functions_EQ : Joined<["-"], "falign-functions=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm align_labels : BooleanFFlag<"align-labels">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_labels_EQ : Joined<["-"], "falign-labels=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm align_loops : BooleanFFlag<"align-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_loops_EQ : Joined<["-"], "falign-loops=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm align_jumps : BooleanFFlag<"align-jumps">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_jumps_EQ : Joined<["-"], "falign-jumps=">, Group<clang_ignored_gcc_optimization_f_Group>;

	// FIXME: This option should be supported and wired up to our diagnostics, but
	// ignore it for now to avoid breaking builds that use it.
	def fdiagnostics_show_location_EQ : Joined<["-"], "fdiagnostics-show-location=">, Group<clang_ignored_f_Group>;

	defm fcheck_new : BooleanFFlag<"check-new">, Group<clang_ignored_f_Group>;
	defm caller_saves : BooleanFFlag<"caller-saves">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm reorder_blocks : BooleanFFlag<"reorder-blocks">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm eliminate_unused_debug_types : BooleanFFlag<"eliminate-unused-debug-types">, Group<clang_ignored_f_Group>;
	defm branch_count_reg : BooleanFFlag<"branch-count-reg">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm default_inline : BooleanFFlag<"default-inline">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm delete_null_pointer_checks : BooleanFFlag<"delete-null-pointer-checks">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm fat_lto_objects : BooleanFFlag<"fat-lto-objects">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm float_store : BooleanFFlag<"float-store">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm friend_injection : BooleanFFlag<"friend-injection">, Group<clang_ignored_f_Group>;
	defm function_attribute_list : BooleanFFlag<"function-attribute-list">, Group<clang_ignored_f_Group>;
	defm gcse : BooleanFFlag<"gcse">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_after_reload: BooleanFFlag<"gcse-after-reload">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_las: BooleanFFlag<"gcse-las">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_sm: BooleanFFlag<"gcse-sm">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gnu : BooleanFFlag<"gnu">, Group<clang_ignored_f_Group>;
	defm ident : BooleanFFlag<"ident">, Group<clang_ignored_f_Group>;
	defm implicit_templates : BooleanFFlag<"implicit-templates">, Group<clang_ignored_f_Group>;
	defm implement_inlines : BooleanFFlag<"implement-inlines">, Group<clang_ignored_f_Group>;
	defm merge_constants : BooleanFFlag<"merge-constants">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm modulo_sched : BooleanFFlag<"modulo-sched">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm modulo_sched_allow_regmoves : BooleanFFlag<"modulo-sched-allow-regmoves">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm inline_functions_called_once : BooleanFFlag<"inline-functions-called-once">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	def finline_limit_EQ : Joined<["-"], "finline-limit=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm finline_limit : BooleanFFlag<"inline-limit">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm inline_small_functions : BooleanFFlag<"inline-small-functions">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm ipa_cp : BooleanFFlag<"ipa-cp">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm ivopts : BooleanFFlag<"ivopts">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm non_call_exceptions : BooleanFFlag<"non-call-exceptions">, Group<clang_ignored_f_Group>;
	defm peel_loops : BooleanFFlag<"peel-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm permissive : BooleanFFlag<"permissive">, Group<clang_ignored_f_Group>;
	defm prefetch_loop_arrays : BooleanFFlag<"prefetch-loop-arrays">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm printf : BooleanFFlag<"printf">, Group<clang_ignored_f_Group>;
	defm profile : BooleanFFlag<"profile">, Group<clang_ignored_f_Group>;
	defm profile_correction : BooleanFFlag<"profile-correction">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm profile_generate_sampling : BooleanFFlag<"profile-generate-sampling">, Group<clang_ignored_f_Group>;
	defm profile_reusedist : BooleanFFlag<"profile-reusedist">, Group<clang_ignored_f_Group>;
	defm profile_values : BooleanFFlag<"profile-values">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm regs_graph : BooleanFFlag<"regs-graph">, Group<clang_ignored_f_Group>;
	defm rename_registers : BooleanFFlag<"rename-registers">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm ripa : BooleanFFlag<"ripa">, Group<clang_ignored_f_Group>;
	defm rounding_math : BooleanFFlag<"rounding-math">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm schedule_insns : BooleanFFlag<"schedule-insns">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm schedule_insns2 : BooleanFFlag<"schedule-insns2">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm see : BooleanFFlag<"see">, Group<clang_ignored_f_Group>;
	defm signaling_nans : BooleanFFlag<"signaling-nans">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm single_precision_constant : BooleanFFlag<"single-precision-constant">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm spec_constr_count : BooleanFFlag<"spec-constr-count">, Group<clang_ignored_f_Group>;
	defm stack_check : BooleanFFlag<"stack-check">, Group<clang_ignored_f_Group>;
	defm strength_reduce :
	BooleanFFlag<"strength-reduce">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tls_model : BooleanFFlag<"tls-model">, Group<clang_ignored_f_Group>;
	defm tracer : BooleanFFlag<"tracer">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_dce : BooleanFFlag<"tree-dce">, Group<clang_ignored_gcc_optimization_f_Group>;
	// NOTE(review): the three tree_loop_* entries below pass names containing
	// underscores, so the flags they define are spelled "-ftree_loop_im",
	// "-fno-tree_loop_im", etc., unlike the hyphenated names used by the
	// neighbouring entries. GCC presumably spells these "-ftree-loop-im",
	// "-ftree-loop-ivcanon" and "-ftree-loop-linear" -- confirm, and fix the
	// strings if the underscores are unintentional.
	defm tree_loop_im : BooleanFFlag<"tree_loop_im">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_loop_ivcanon : BooleanFFlag<"tree_loop_ivcanon">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_loop_linear : BooleanFFlag<"tree_loop_linear">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_salias : BooleanFFlag<"tree-salias">, Group<clang_ignored_f_Group>;
	defm tree_ter : BooleanFFlag<"tree-ter">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_vectorizer_verbose : BooleanFFlag<"tree-vectorizer-verbose">, Group<clang_ignored_f_Group>;
	defm tree_vrp : BooleanFFlag<"tree-vrp">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm unroll_all_loops : BooleanFFlag<"unroll-all-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm unsafe_loop_optimizations : BooleanFFlag<"unsafe-loop-optimizations">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm unswitch_loops : BooleanFFlag<"unswitch-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm use_linker_plugin : BooleanFFlag<"use-linker-plugin">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm vect_cost_model : BooleanFFlag<"vect-cost-model">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm variable_expansion_in_unroller : BooleanFFlag<"variable-expansion-in-unroller">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm web : BooleanFFlag<"web">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm whole_program : BooleanFFlag<"whole-program">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm devirtualize : BooleanFFlag<"devirtualize">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm devirtualize_speculatively : BooleanFFlag<"devirtualize-speculatively">,
	Group<clang_ignored_gcc_optimization_f_Group>;

	// Generic gfortran options.
	def A_DASH : Joined<["-"], "A-">, Group<gfortran_Group>;
	def J : JoinedOrSeparate<["-"], "J">, Flags<[RenderJoined]>, Group<gfortran_Group>;
	def cpp : Flag<["-"], "cpp">, Group<gfortran_Group>;
	def nocpp : Flag<["-"], "nocpp">, Group<gfortran_Group>;
	def static_libgfortran : Flag<["-"], "static-libgfortran">, Group<gfortran_Group>;

	// "f" options with values for gfortran.
	def fblas_matmul_limit_EQ : Joined<["-"], "fblas-matmul-limit=">, Group<gfortran_Group>;
	def fcheck_EQ : Joined<["-"], "fcheck=">, Group<gfortran_Group>;
	def fcoarray_EQ : Joined<["-"], "fcoarray=">, Group<gfortran_Group>;
	def fconvert_EQ : Joined<["-"], "fconvert=">, Group<gfortran_Group>;
	def ffixed_line_length_VALUE : Joined<["-"], "ffixed-line-length-">, Group<gfortran_Group>;
	def ffpe_trap_EQ : Joined<["-"], "ffpe-trap=">, Group<gfortran_Group>;
	def ffree_line_length_VALUE : Joined<["-"], "ffree-line-length-">, Group<gfortran_Group>;
	def finit_character_EQ : Joined<["-"], "finit-character=">, Group<gfortran_Group>;
	def finit_integer_EQ : Joined<["-"], "finit-integer=">, Group<gfortran_Group>;
	def finit_logical_EQ : Joined<["-"], "finit-logical=">, Group<gfortran_Group>;
	def finit_real_EQ : Joined<["-"], "finit-real=">, Group<gfortran_Group>;
	def fmax_array_constructor_EQ : Joined<["-"], "fmax-array-constructor=">, Group<gfortran_Group>;
	def fmax_errors_EQ : Joined<["-"], "fmax-errors=">, Group<gfortran_Group>;
	def fmax_stack_var_size_EQ : Joined<["-"], "fmax-stack-var-size=">, Group<gfortran_Group>;
	def fmax_subrecord_length_EQ : Joined<["-"], "fmax-subrecord-length=">, Group<gfortran_Group>;
	def frecord_marker_EQ : Joined<["-"], "frecord-marker=">, Group<gfortran_Group>;

	// "f" flags for gfortran.
	// Each entry instantiates the BooleanFFlag multiclass with the flag's base
	// spelling and places the result in gfortran_Group. BooleanFFlag is defined
	// outside this excerpt; presumably it produces the -f<name>/-fno-<name>
	// pair -- TODO confirm against its definition.
	defm aggressive_function_elimination : BooleanFFlag<"aggressive-function-elimination">, Group<gfortran_Group>;
	defm align_commons : BooleanFFlag<"align-commons">, Group<gfortran_Group>;
	defm all_intrinsics : BooleanFFlag<"all-intrinsics">, Group<gfortran_Group>;
	defm automatic : BooleanFFlag<"automatic">, Group<gfortran_Group>;
	defm backslash : BooleanFFlag<"backslash">, Group<gfortran_Group>;
	defm backtrace : BooleanFFlag<"backtrace">, Group<gfortran_Group>;
	defm bounds_check : BooleanFFlag<"bounds-check">, Group<gfortran_Group>;
	defm check_array_temporaries : BooleanFFlag<"check-array-temporaries">, Group<gfortran_Group>;
	defm cray_pointer : BooleanFFlag<"cray-pointer">, Group<gfortran_Group>;
	defm d_lines_as_code : BooleanFFlag<"d-lines-as-code">, Group<gfortran_Group>;
	defm d_lines_as_comments : BooleanFFlag<"d-lines-as-comments">, Group<gfortran_Group>;
	defm default_double_8 : BooleanFFlag<"default-double-8">, Group<gfortran_Group>;
	defm default_integer_8 : BooleanFFlag<"default-integer-8">, Group<gfortran_Group>;
	defm default_real_8 : BooleanFFlag<"default-real-8">, Group<gfortran_Group>;
	defm dollar_ok : BooleanFFlag<"dollar-ok">, Group<gfortran_Group>;
	defm dump_fortran_optimized : BooleanFFlag<"dump-fortran-optimized">, Group<gfortran_Group>;
	defm dump_fortran_original : BooleanFFlag<"dump-fortran-original">, Group<gfortran_Group>;
	defm dump_parse_tree : BooleanFFlag<"dump-parse-tree">, Group<gfortran_Group>;
	defm external_blas : BooleanFFlag<"external-blas">, Group<gfortran_Group>;
	defm f2c : BooleanFFlag<"f2c">, Group<gfortran_Group>;
	defm fixed_form : BooleanFFlag<"fixed-form">, Group<gfortran_Group>;
	defm free_form : BooleanFFlag<"free-form">, Group<gfortran_Group>;
	defm frontend_optimize : BooleanFFlag<"frontend-optimize">, Group<gfortran_Group>;
	defm implicit_none : BooleanFFlag<"implicit-none">, Group<gfortran_Group>;
	defm init_local_zero : BooleanFFlag<"init-local-zero">, Group<gfortran_Group>;
	defm integer_4_integer_8 : BooleanFFlag<"integer-4-integer-8">, Group<gfortran_Group>;
	defm intrinsic_modules_path : BooleanFFlag<"intrinsic-modules-path">, Group<gfortran_Group>;
	defm max_identifier_length : BooleanFFlag<"max-identifier-length">, Group<gfortran_Group>;
	defm module_private : BooleanFFlag<"module-private">, Group<gfortran_Group>;
	defm pack_derived : BooleanFFlag<"pack-derived">, Group<gfortran_Group>;
	defm protect_parens : BooleanFFlag<"protect-parens">, Group<gfortran_Group>;
	defm range_check : BooleanFFlag<"range-check">, Group<gfortran_Group>;
	defm real_4_real_10 : BooleanFFlag<"real-4-real-10">, Group<gfortran_Group>;
	defm real_4_real_16 : BooleanFFlag<"real-4-real-16">, Group<gfortran_Group>;
	defm real_4_real_8 : BooleanFFlag<"real-4-real-8">, Group<gfortran_Group>;
	defm real_8_real_10 : BooleanFFlag<"real-8-real-10">, Group<gfortran_Group>;
	defm real_8_real_16 : BooleanFFlag<"real-8-real-16">, Group<gfortran_Group>;
	defm real_8_real_4 : BooleanFFlag<"real-8-real-4">, Group<gfortran_Group>;
	defm realloc_lhs : BooleanFFlag<"realloc-lhs">, Group<gfortran_Group>;
	defm recursive : BooleanFFlag<"recursive">, Group<gfortran_Group>;
	defm repack_arrays : BooleanFFlag<"repack-arrays">, Group<gfortran_Group>;
	defm second_underscore : BooleanFFlag<"second-underscore">, Group<gfortran_Group>;
	defm sign_zero : BooleanFFlag<"sign-zero">, Group<gfortran_Group>;
	defm stack_arrays : BooleanFFlag<"stack-arrays">, Group<gfortran_Group>;
	defm underscoring : BooleanFFlag<"underscoring">, Group<gfortran_Group>;
	defm whole_file : BooleanFFlag<"whole-file">, Group<gfortran_Group>;


	include "CC1Options.td"

	include "CLCompatOptions.td"
	Index: head/contrib/llvm/tools/clang/lib/Basic/Targets/X86.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Basic/Targets/X86.cpp (revision 328816)
	+++ head/contrib/llvm/tools/clang/lib/Basic/Targets/X86.cpp (revision 328817)
	@@ -1,1596 +1,1602 @@
	//===--- X86.cpp - Implement X86 target feature support -------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements X86 TargetInfo objects.
	//
	//===----------------------------------------------------------------------===//

	#include "X86.h"
	#include "clang/Basic/Builtins.h"
	#include "clang/Basic/Diagnostic.h"
	#include "clang/Basic/TargetBuiltins.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"

	namespace clang {
	namespace targets {

	// Table of X86 builtin descriptors. The entries are not written out here:
	// they are produced by expanding the three macros below over the contents
	// of BuiltinsX86.def and then BuiltinsX86_64.def. Every macro form emits one
	// six-field initializer; fields a given form does not take as an argument
	// are filled with nullptr (header, feature) or ALL_LANGUAGES.
	const Builtin::Info BuiltinInfoX86[] = {
	// Expansion rules for the first .def file.
	#define BUILTIN(ID, TYPE, ATTRS) \
	{#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr},
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE},
	#define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \
	{#ID, TYPE, ATTRS, HEADER, LANGS, FEATURE},
	#include "clang/Basic/BuiltinsX86.def"

	// The same rules are defined again for the second .def file, presumably
	// because each .def file undefines them at its end (not visible here).
	#define BUILTIN(ID, TYPE, ATTRS) \
	{#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr},
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE},
	#define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \
	{#ID, TYPE, ATTRS, HEADER, LANGS, FEATURE},
	#include "clang/Basic/BuiltinsX86_64.def"
	};

	// Canonical X86 register names. The position of each name matters: the
	// numbers stored in AddlRegNames line up with positions in this array
	// (for example 0 is "ax", 3 is "bx", 38 is "r8"), so entries must not be
	// reordered or inserted in the middle without updating that table.
	static const char *const GCCRegNames[] = {
	"ax", "dx", "cx", "bx", "si", "di", "bp", "sp",
	"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
	"argp", "flags", "fpcr", "fpsr", "dirflag", "frame", "xmm0", "xmm1",
	"xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "mm0", "mm1",
	"mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "r8", "r9",
	"r10", "r11", "r12", "r13", "r14", "r15", "xmm8", "xmm9",
	"xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1",
	"ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9",
	"ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "xmm16", "xmm17",
	"xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", "xmm24", "xmm25",
	"xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", "ymm16", "ymm17",
	"ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25",
	"ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm0", "zmm1",
	"zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9",
	"zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
	"zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25",
	"zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "k0", "k1",
	"k2", "k3", "k4", "k5", "k6", "k7",
	"cr0", "cr2", "cr3", "cr4", "cr8",
	"dr0", "dr1", "dr2", "dr3", "dr6", "dr7",
	"bnd0", "bnd1", "bnd2", "bnd3",
	};

	// Additional spellings for registers already listed in GCCRegNames. Each
	// entry pairs a set of alternative names with a number that matches the
	// position of the corresponding base register in GCCRegNames (0 = "ax",
	// 1 = "dx", 2 = "cx", 3 = "bx", 4 = "si", 5 = "di", 6 = "bp", 7 = "sp",
	// 38..45 = "r8".."r15").
	const TargetInfo::AddlRegName AddlRegNames[] = {
	{{"al", "ah", "eax", "rax"}, 0},
	{{"bl", "bh", "ebx", "rbx"}, 3},
	{{"cl", "ch", "ecx", "rcx"}, 2},
	{{"dl", "dh", "edx", "rdx"}, 1},
	{{"esi", "rsi"}, 4},
	{{"edi", "rdi"}, 5},
	{{"esp", "rsp"}, 7},
	{{"ebp", "rbp"}, 6},
	{{"r8d", "r8w", "r8b"}, 38},
	{{"r9d", "r9w", "r9b"}, 39},
	{{"r10d", "r10w", "r10b"}, 40},
	{{"r11d", "r11w", "r11b"}, 41},
	{{"r12d", "r12w", "r12b"}, 42},
	{{"r13d", "r13w", "r13b"}, 43},
	{{"r14d", "r14w", "r14b"}, 44},
	{{"r15d", "r15w", "r15b"}, 45},
	};

	} // namespace targets
	} // namespace clang

	using namespace clang;
	using namespace clang::targets;

	// Select the floating-point math mode named by Name.
	//
	// Only the spellings "387" and "sse" are recognized. For any other name
	// FPMath is left untouched and false is returned; otherwise FPMath is
	// updated and true is returned.
	bool X86TargetInfo::setFPMath(StringRef Name) {
	  bool Recognized = true;
	  if (Name == "387")
	    FPMath = FP_387;
	  else if (Name == "sse")
	    FPMath = FP_SSE;
	  else
	    Recognized = false;
	  return Recognized;
	}

	// Build the feature map for CPU.
	//
	// Steps, in order:
	//  1. Seed Features with architecture-wide defaults (sse2 on x86_64, x87 on
	//     everything except Lakemont).
	//  2. Add the per-CPU defaults selected by getCPUKind(CPU).
	//  3. Delegate to TargetInfo::initFeatureMap with FeaturesVec (presumably
	//     this applies the explicit +/- requests; its body is not visible here).
	//  4. Derive popcnt, prfchw and mmx from sse4.2, 3dnow and sse respectively,
	//     unless FeaturesVec contains the matching "-popcnt"/"-prfchw"/"-mmx".
	//
	// Returns false if the base-class call in step 3 returns false, otherwise
	// true.
	bool X86TargetInfo::initFeatureMap(
	llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const {
	// FIXME: This really should not be here.
	// X86_64 always has SSE2.
	if (getTriple().getArch() == llvm::Triple::x86_64)
	setFeatureEnabledImpl(Features, "sse2", true);

	const CPUKind Kind = getCPUKind(CPU);

	// Enable X87 for all X86 processors but Lakemont.
	if (Kind != CK_Lakemont)
	setFeatureEnabledImpl(Features, "x87", true);

	switch (Kind) {
	// No CPU-specific features are added for these kinds.
	case CK_Generic:
	case CK_i386:
	case CK_i486:
	case CK_i586:
	case CK_Pentium:
	case CK_PentiumPro:
	case CK_Lakemont:
	break;

	// MMX is the only CPU-specific addition for these kinds.
	case CK_PentiumMMX:
	case CK_Pentium2:
	case CK_K6:
	case CK_WinChipC6:
	setFeatureEnabledImpl(Features, "mmx", true);
	break;

	// Cumulative chain: every case from here to CK_Pentium3/CK_C3_2 enables
	// its own additions and then falls through to the next case, so a CPU
	// receives its own features plus those of every case below it.
	case CK_Icelake:
	setFeatureEnabledImpl(Features, "vaes", true);
	setFeatureEnabledImpl(Features, "gfni", true);
	setFeatureEnabledImpl(Features, "vpclmulqdq", true);
	setFeatureEnabledImpl(Features, "avx512bitalg", true);
	setFeatureEnabledImpl(Features, "avx512vnni", true);
	setFeatureEnabledImpl(Features, "avx512vbmi2", true);
	setFeatureEnabledImpl(Features, "avx512vpopcntdq", true);
	LLVM_FALLTHROUGH;
	case CK_Cannonlake:
	setFeatureEnabledImpl(Features, "avx512ifma", true);
	setFeatureEnabledImpl(Features, "avx512vbmi", true);
	setFeatureEnabledImpl(Features, "sha", true);
	LLVM_FALLTHROUGH;
	case CK_SkylakeServer:
	setFeatureEnabledImpl(Features, "avx512f", true);
	setFeatureEnabledImpl(Features, "avx512cd", true);
	setFeatureEnabledImpl(Features, "avx512dq", true);
	setFeatureEnabledImpl(Features, "avx512bw", true);
	setFeatureEnabledImpl(Features, "avx512vl", true);
	setFeatureEnabledImpl(Features, "pku", true);
	setFeatureEnabledImpl(Features, "clwb", true);
	LLVM_FALLTHROUGH;
	case CK_SkylakeClient:
	setFeatureEnabledImpl(Features, "xsavec", true);
	setFeatureEnabledImpl(Features, "xsaves", true);
	setFeatureEnabledImpl(Features, "mpx", true);
	setFeatureEnabledImpl(Features, "sgx", true);
	setFeatureEnabledImpl(Features, "clflushopt", true);
	setFeatureEnabledImpl(Features, "rtm", true);
	LLVM_FALLTHROUGH;
	case CK_Broadwell:
	setFeatureEnabledImpl(Features, "rdseed", true);
	setFeatureEnabledImpl(Features, "adx", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	LLVM_FALLTHROUGH;
	case CK_Haswell:
	setFeatureEnabledImpl(Features, "avx2", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "bmi2", true);
	setFeatureEnabledImpl(Features, "fma", true);
	setFeatureEnabledImpl(Features, "movbe", true);
	LLVM_FALLTHROUGH;
	case CK_IvyBridge:
	setFeatureEnabledImpl(Features, "rdrnd", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	LLVM_FALLTHROUGH;
	case CK_SandyBridge:
	setFeatureEnabledImpl(Features, "avx", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	LLVM_FALLTHROUGH;
	case CK_Westmere:
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	LLVM_FALLTHROUGH;
	case CK_Nehalem:
	setFeatureEnabledImpl(Features, "sse4.2", true);
	LLVM_FALLTHROUGH;
	case CK_Penryn:
	setFeatureEnabledImpl(Features, "sse4.1", true);
	LLVM_FALLTHROUGH;
	case CK_Core2:
	setFeatureEnabledImpl(Features, "ssse3", true);
	LLVM_FALLTHROUGH;
	case CK_Yonah:
	case CK_Prescott:
	case CK_Nocona:
	setFeatureEnabledImpl(Features, "sse3", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	LLVM_FALLTHROUGH;
	case CK_PentiumM:
	case CK_Pentium4:
	case CK_x86_64:
	setFeatureEnabledImpl(Features, "sse2", true);
	LLVM_FALLTHROUGH;
	case CK_Pentium3:
	case CK_C3_2:
	setFeatureEnabledImpl(Features, "sse", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	break;

	// Separate cumulative chain: Goldmont -> Silvermont -> Bonnell.
	case CK_Goldmont:
	setFeatureEnabledImpl(Features, "sha", true);
	setFeatureEnabledImpl(Features, "rdrnd", true);
	setFeatureEnabledImpl(Features, "rdseed", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	setFeatureEnabledImpl(Features, "xsavec", true);
	setFeatureEnabledImpl(Features, "xsaves", true);
	setFeatureEnabledImpl(Features, "clflushopt", true);
	setFeatureEnabledImpl(Features, "mpx", true);
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	LLVM_FALLTHROUGH;
	case CK_Silvermont:
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "sse4.2", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	LLVM_FALLTHROUGH;
	case CK_Bonnell:
	setFeatureEnabledImpl(Features, "movbe", true);
	setFeatureEnabledImpl(Features, "ssse3", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	break;

	// KNM gets avx512vpopcntdq on top of everything KNL gets.
	case CK_KNM:
	// TODO: Add avx5124fmaps/avx5124vnniw.
	setFeatureEnabledImpl(Features, "avx512vpopcntdq", true);
	LLVM_FALLTHROUGH;
	case CK_KNL:
	setFeatureEnabledImpl(Features, "avx512f", true);
	setFeatureEnabledImpl(Features, "avx512cd", true);
	setFeatureEnabledImpl(Features, "avx512er", true);
	setFeatureEnabledImpl(Features, "avx512pf", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	setFeatureEnabledImpl(Features, "prefetchwt1", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "rdseed", true);
	setFeatureEnabledImpl(Features, "adx", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "bmi2", true);
	setFeatureEnabledImpl(Features, "rtm", true);
	setFeatureEnabledImpl(Features, "fma", true);
	setFeatureEnabledImpl(Features, "rdrnd", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	setFeatureEnabledImpl(Features, "movbe", true);
	break;

	// 3dnow is the only CPU-specific addition for these kinds.
	case CK_K6_2:
	case CK_K6_3:
	case CK_WinChip2:
	case CK_C3:
	setFeatureEnabledImpl(Features, "3dnow", true);
	break;

	// Cumulative chain: AMDFAM10 -> K8SSE3 -> K8 -> AthlonXP -> Athlon/Geode.
	case CK_AMDFAM10:
	setFeatureEnabledImpl(Features, "sse4a", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "popcnt", true);
	LLVM_FALLTHROUGH;
	case CK_K8SSE3:
	setFeatureEnabledImpl(Features, "sse3", true);
	LLVM_FALLTHROUGH;
	case CK_K8:
	setFeatureEnabledImpl(Features, "sse2", true);
	LLVM_FALLTHROUGH;
	case CK_AthlonXP:
	setFeatureEnabledImpl(Features, "sse", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	LLVM_FALLTHROUGH;
	case CK_Athlon:
	case CK_Geode:
	setFeatureEnabledImpl(Features, "3dnowa", true);
	break;

	// BTVER2 gets its own additions plus everything BTVER1 gets.
	case CK_BTVER2:
	setFeatureEnabledImpl(Features, "avx", true);
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	setFeatureEnabledImpl(Features, "movbe", true);
	LLVM_FALLTHROUGH;
	case CK_BTVER1:
	setFeatureEnabledImpl(Features, "ssse3", true);
	setFeatureEnabledImpl(Features, "sse4a", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "popcnt", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	break;

	// ZNVER1 is listed in full; it does not fall through to any other case.
	case CK_ZNVER1:
	setFeatureEnabledImpl(Features, "adx", true);
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "avx2", true);
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "bmi2", true);
	setFeatureEnabledImpl(Features, "clflushopt", true);
	setFeatureEnabledImpl(Features, "clzero", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "fma", true);
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "mwaitx", true);
	setFeatureEnabledImpl(Features, "movbe", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "popcnt", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	setFeatureEnabledImpl(Features, "rdrnd", true);
	setFeatureEnabledImpl(Features, "rdseed", true);
	setFeatureEnabledImpl(Features, "sha", true);
	setFeatureEnabledImpl(Features, "sse4a", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	setFeatureEnabledImpl(Features, "xsavec", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	setFeatureEnabledImpl(Features, "xsaves", true);
	break;

	// Cumulative chain: BDVER4 -> BDVER3 -> BDVER2 -> BDVER1.
	case CK_BDVER4:
	setFeatureEnabledImpl(Features, "avx2", true);
	setFeatureEnabledImpl(Features, "bmi2", true);
	setFeatureEnabledImpl(Features, "mwaitx", true);
	LLVM_FALLTHROUGH;
	case CK_BDVER3:
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	LLVM_FALLTHROUGH;
	case CK_BDVER2:
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "fma", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "tbm", true);
	LLVM_FALLTHROUGH;
	case CK_BDVER1:
	// xop implies avx, sse4a and fma4.
	setFeatureEnabledImpl(Features, "xop", true);
	setFeatureEnabledImpl(Features, "lwp", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	break;
	}
	// Hand over to the base class; a false result aborts initialization.
	if (!TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec))
	return false;

	// Can't do this earlier because we need to be able to explicitly enable
	// or disable these features and the things that they depend upon.

	// Enable popcnt if sse4.2 is enabled and popcnt is not explicitly disabled.
	auto I = Features.find("sse4.2");
	if (I != Features.end() && I->getValue() &&
	std::find(FeaturesVec.begin(), FeaturesVec.end(), "-popcnt") ==
	FeaturesVec.end())
	Features["popcnt"] = true;

	// Enable prfchw if 3DNow! is enabled and prfchw is not explicitly disabled.
	I = Features.find("3dnow");
	if (I != Features.end() && I->getValue() &&
	std::find(FeaturesVec.begin(), FeaturesVec.end(), "-prfchw") ==
	FeaturesVec.end())
	Features["prfchw"] = true;

	// Additionally, if SSE is enabled and mmx is not explicitly disabled,
	// then enable MMX.
	I = Features.find("sse");
	if (I != Features.end() && I->getValue() &&
	std::find(FeaturesVec.begin(), FeaturesVec.end(), "-mmx") ==
	FeaturesVec.end())
	Features["mmx"] = true;

	return true;
	}

	// Switch the SSE/AVX feature ladder on or off at Level.
	//
	// Disabling starts at Level's case and falls through every later case, so
	// it clears Level, each rung listed after it (ending with the avx512
	// group) and the extra features attached to those rungs. Enabling starts
	// at Level's case and falls through towards "sse", so it sets Level and
	// every rung listed below it.
	void X86TargetInfo::setSSELevel(llvm::StringMap<bool> &Features,
	                                X86SSEEnum Level, bool Enabled) {
	  if (!Enabled) {
	    switch (Level) {
	    case NoSSE:
	    case SSE1:
	      Features["sse"] = false;
	      LLVM_FALLTHROUGH;
	    case SSE2:
	      Features["sse2"] = Features["pclmul"] = Features["aes"] =
	          Features["sha"] = Features["gfni"] = false;
	      LLVM_FALLTHROUGH;
	    case SSE3:
	      Features["sse3"] = false;
	      // Clears sse4a, fma4 and xop as well.
	      setXOPLevel(Features, NoXOP, false);
	      LLVM_FALLTHROUGH;
	    case SSSE3:
	      Features["ssse3"] = false;
	      LLVM_FALLTHROUGH;
	    case SSE41:
	      Features["sse4.1"] = false;
	      LLVM_FALLTHROUGH;
	    case SSE42:
	      Features["sse4.2"] = false;
	      LLVM_FALLTHROUGH;
	    case AVX:
	      Features["fma"] = Features["avx"] = Features["f16c"] =
	          Features["xsave"] = Features["xsaveopt"] = Features["vaes"] =
	              Features["vpclmulqdq"] = false;
	      // Clears fma4 and xop as well.
	      setXOPLevel(Features, FMA4, false);
	      LLVM_FALLTHROUGH;
	    case AVX2:
	      Features["avx2"] = false;
	      LLVM_FALLTHROUGH;
	    case AVX512F:
	      Features["avx512f"] = Features["avx512cd"] = Features["avx512er"] =
	          Features["avx512pf"] = Features["avx512dq"] =
	              Features["avx512bw"] = Features["avx512vl"] =
	                  Features["avx512vbmi"] = Features["avx512ifma"] =
	                      Features["avx512vpopcntdq"] =
	                          Features["avx512bitalg"] =
	                              Features["avx512vnni"] =
	                                  Features["avx512vbmi2"] = false;
	      break;
	    }
	    return;
	  }

	  // Enabling path.
	  switch (Level) {
	  case AVX512F:
	    Features["avx512f"] = Features["fma"] = Features["f16c"] = true;
	    LLVM_FALLTHROUGH;
	  case AVX2:
	    Features["avx2"] = true;
	    LLVM_FALLTHROUGH;
	  case AVX:
	    Features["avx"] = true;
	    Features["xsave"] = true;
	    LLVM_FALLTHROUGH;
	  case SSE42:
	    Features["sse4.2"] = true;
	    LLVM_FALLTHROUGH;
	  case SSE41:
	    Features["sse4.1"] = true;
	    LLVM_FALLTHROUGH;
	  case SSSE3:
	    Features["ssse3"] = true;
	    LLVM_FALLTHROUGH;
	  case SSE3:
	    Features["sse3"] = true;
	    LLVM_FALLTHROUGH;
	  case SSE2:
	    Features["sse2"] = true;
	    LLVM_FALLTHROUGH;
	  case SSE1:
	    Features["sse"] = true;
	    LLVM_FALLTHROUGH;
	  case NoSSE:
	    break;
	  }
	}

	// Switch the MMX/3DNow! feature ladder on or off at Level.
	//
	// Disabling clears Level and every case listed after it (mmx -> 3dnow ->
	// 3dnowa). Enabling sets Level and every case listed below it (3dnowa ->
	// 3dnow -> mmx). Enabling NoMMX3DNow changes nothing.
	void X86TargetInfo::setMMXLevel(llvm::StringMap<bool> &Features,
	                                MMX3DNowEnum Level, bool Enabled) {
	  if (!Enabled) {
	    switch (Level) {
	    case NoMMX3DNow:
	    case MMX:
	      Features["mmx"] = false;
	      LLVM_FALLTHROUGH;
	    case AMD3DNow:
	      Features["3dnow"] = false;
	      LLVM_FALLTHROUGH;
	    case AMD3DNowAthlon:
	      Features["3dnowa"] = false;
	      break;
	    }
	    return;
	  }

	  // Enabling path.
	  switch (Level) {
	  case AMD3DNowAthlon:
	    Features["3dnowa"] = true;
	    LLVM_FALLTHROUGH;
	  case AMD3DNow:
	    Features["3dnow"] = true;
	    LLVM_FALLTHROUGH;
	  case MMX:
	    Features["mmx"] = true;
	    LLVM_FALLTHROUGH;
	  case NoMMX3DNow:
	    break;
	  }
	}

	// Switch the sse4a/fma4/xop feature ladder on or off at Level.
	//
	// Disabling clears Level and every case listed after it (sse4a -> fma4 ->
	// xop). Enabling sets Level and every case listed below it, and also
	// raises the SSE ladder: fma4 enables the AVX level, sse4a enables the
	// SSE3 level. Enabling NoXOP changes nothing.
	void X86TargetInfo::setXOPLevel(llvm::StringMap<bool> &Features, XOPEnum Level,
	                                bool Enabled) {
	  if (!Enabled) {
	    switch (Level) {
	    case NoXOP:
	    case SSE4A:
	      Features["sse4a"] = false;
	      LLVM_FALLTHROUGH;
	    case FMA4:
	      Features["fma4"] = false;
	      LLVM_FALLTHROUGH;
	    case XOP:
	      Features["xop"] = false;
	      break;
	    }
	    return;
	  }

	  // Enabling path.
	  switch (Level) {
	  case XOP:
	    Features["xop"] = true;
	    LLVM_FALLTHROUGH;
	  case FMA4:
	    Features["fma4"] = true;
	    setSSELevel(Features, AVX, true);
	    LLVM_FALLTHROUGH;
	  case SSE4A:
	    Features["sse4a"] = true;
	    setSSELevel(Features, SSE3, true);
	    LLVM_FALLTHROUGH;
	  case NoXOP:
	    break;
	  }
	}

	// Set feature Name to Enabled in Features and propagate the change to
	// related features.
	//
	// Name is stored in the map directly, except for the "sse4" alias, which
	// is never stored under its own name. The if/else chain below then adjusts
	// whatever Name implies (when enabling) or whatever depends on Name (when
	// disabling), mostly by calling setSSELevel, setMMXLevel or setXOPLevel.
	// Names that match no branch are only recorded in the map.
	void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
	StringRef Name, bool Enabled) {
	// This is a bit of a hack to deal with the sse4 target feature when used
	// as part of the target attribute. We handle sse4 correctly everywhere
	// else. See below for more information on how we handle the sse4 options.
	if (Name != "sse4")
	Features[Name] = Enabled;

	if (Name == "mmx") {
	setMMXLevel(Features, MMX, Enabled);
	} else if (Name == "sse") {
	setSSELevel(Features, SSE1, Enabled);
	} else if (Name == "sse2") {
	setSSELevel(Features, SSE2, Enabled);
	} else if (Name == "sse3") {
	setSSELevel(Features, SSE3, Enabled);
	} else if (Name == "ssse3") {
	setSSELevel(Features, SSSE3, Enabled);
	} else if (Name == "sse4.2") {
	setSSELevel(Features, SSE42, Enabled);
	} else if (Name == "sse4.1") {
	setSSELevel(Features, SSE41, Enabled);
	} else if (Name == "3dnow") {
	setMMXLevel(Features, AMD3DNow, Enabled);
	} else if (Name == "3dnowa") {
	setMMXLevel(Features, AMD3DNowAthlon, Enabled);
	} else if (Name == "aes") {
	// Enabling aes raises the SSE level to SSE2; disabling it also clears vaes.
	if (Enabled)
	setSSELevel(Features, SSE2, Enabled);
	else
	Features["vaes"] = false;
	} else if (Name == "vaes") {
	// Enabling vaes raises the SSE level to AVX and turns on aes. Disabling
	// vaes has no further effect.
	if (Enabled) {
	setSSELevel(Features, AVX, Enabled);
	Features["aes"] = true;
	}
	} else if (Name == "pclmul") {
	// Enabling pclmul raises the SSE level to SSE2; disabling it also clears
	// vpclmulqdq.
	if (Enabled)
	setSSELevel(Features, SSE2, Enabled);
	else
	Features["vpclmulqdq"] = false;
	} else if (Name == "vpclmulqdq") {
	// Enabling vpclmulqdq raises the SSE level to AVX and turns on pclmul.
	if (Enabled) {
	setSSELevel(Features, AVX, Enabled);
	Features["pclmul"] = true;
	}
	} else if (Name == "gfni") {
	if (Enabled)
	setSSELevel(Features, SSE2, Enabled);
	} else if (Name == "avx") {
	setSSELevel(Features, AVX, Enabled);
	} else if (Name == "avx2") {
	setSSELevel(Features, AVX2, Enabled);
	} else if (Name == "avx512f") {
	setSSELevel(Features, AVX512F, Enabled);
	} else if (Name == "avx512cd" \|\| Name == "avx512er" \|\| Name == "avx512pf" \|\|
	Name == "avx512dq" \|\| Name == "avx512bw" \|\| Name == "avx512vl" \|\|
	Name == "avx512vbmi" \|\| Name == "avx512ifma" \|\|
	Name == "avx512vpopcntdq" \|\| Name == "avx512bitalg" \|\|
	Name == "avx512vnni" \|\| Name == "avx512vbmi2") {
	// Enabling any AVX-512 extension enables the AVX512F level; disabling one
	// leaves the level alone.
	if (Enabled)
	setSSELevel(Features, AVX512F, Enabled);
	// Enable BWI instruction if VBMI/VBMI2/BITALG is being enabled.
	if ((Name.startswith("avx512vbmi") \|\| Name == "avx512bitalg") && Enabled)
	Features["avx512bw"] = true;
	// Also disable VBMI/VBMI2/BITALG if BWI is being disabled.
	if (Name == "avx512bw" && !Enabled)
	Features["avx512vbmi"] = Features["avx512vbmi2"] =
	Features["avx512bitalg"] = false;
	} else if (Name == "fma") {
	// Enabling fma raises the SSE level to AVX; disabling it disables the
	// AVX512F level (and whatever setSSELevel clears with it).
	if (Enabled)
	setSSELevel(Features, AVX, Enabled);
	else
	setSSELevel(Features, AVX512F, Enabled);
	} else if (Name == "fma4") {
	setXOPLevel(Features, FMA4, Enabled);
	} else if (Name == "xop") {
	setXOPLevel(Features, XOP, Enabled);
	} else if (Name == "sse4a") {
	setXOPLevel(Features, SSE4A, Enabled);
	} else if (Name == "f16c") {
	// Same pattern as fma: AVX when enabling, AVX512F level off when disabling.
	if (Enabled)
	setSSELevel(Features, AVX, Enabled);
	else
	setSSELevel(Features, AVX512F, Enabled);
	} else if (Name == "sha") {
	if (Enabled)
	setSSELevel(Features, SSE2, Enabled);
	} else if (Name == "sse4") {
	// We can get here via the __target__ attribute since that's not controlled
	// via the -msse4/-mno-sse4 command line alias. Handle this the same way
	// here - turn on the sse4.2 if enabled, turn off the sse4.1 level if
	// disabled.
	if (Enabled)
	setSSELevel(Features, SSE42, Enabled);
	else
	setSSELevel(Features, SSE41, Enabled);
	} else if (Name == "xsave") {
	// Disabling xsave also clears xsaveopt; xsavec and xsaves are not touched
	// here.
	if (!Enabled)
	Features["xsaveopt"] = false;
	} else if (Name == "xsaveopt" \|\| Name == "xsavec" \|\| Name == "xsaves") {
	// Enabling any of the xsave variants turns on xsave itself.
	if (Enabled)
	Features["xsave"] = true;
	}
	}

	/// handleTargetFeatures - Perform initialization based on the user
	/// configured set of features.
	///
	/// Walks \p Features and, for every entry that starts with '+', sets the
	/// matching Has* member and raises SSELevel, MMX3DNowLevel and XOPLevel to
	/// the highest level named so far. Entries that do not start with '+' are
	/// skipped. Afterwards the selected FPMath is checked against SSELevel: on a
	/// mismatch err_target_unsupported_fpmath is reported through \p Diags and
	/// false is returned. Otherwise SimdDefaultAlign is set and true is
	/// returned.
	bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) {
	for (const auto &Feature : Features) {
	// Only "+name" entries are acted upon.
	if (Feature[0] != '+')
	continue;

	// Simple on/off features: each recognized name sets one Has* member.
	if (Feature == "+aes") {
	HasAES = true;
	} else if (Feature == "+vaes") {
	HasVAES = true;
	} else if (Feature == "+pclmul") {
	HasPCLMUL = true;
	} else if (Feature == "+vpclmulqdq") {
	HasVPCLMULQDQ = true;
	} else if (Feature == "+lzcnt") {
	HasLZCNT = true;
	} else if (Feature == "+rdrnd") {
	HasRDRND = true;
	} else if (Feature == "+fsgsbase") {
	HasFSGSBASE = true;
	} else if (Feature == "+bmi") {
	HasBMI = true;
	} else if (Feature == "+bmi2") {
	HasBMI2 = true;
	} else if (Feature == "+popcnt") {
	HasPOPCNT = true;
	} else if (Feature == "+rtm") {
	HasRTM = true;
	} else if (Feature == "+prfchw") {
	HasPRFCHW = true;
	} else if (Feature == "+rdseed") {
	HasRDSEED = true;
	} else if (Feature == "+adx") {
	HasADX = true;
	} else if (Feature == "+tbm") {
	HasTBM = true;
	} else if (Feature == "+lwp") {
	HasLWP = true;
	} else if (Feature == "+fma") {
	HasFMA = true;
	} else if (Feature == "+f16c") {
	HasF16C = true;
	} else if (Feature == "+gfni") {
	HasGFNI = true;
	} else if (Feature == "+avx512cd") {
	HasAVX512CD = true;
	} else if (Feature == "+avx512vpopcntdq") {
	HasAVX512VPOPCNTDQ = true;
	} else if (Feature == "+avx512vnni") {
	HasAVX512VNNI = true;
	} else if (Feature == "+avx512er") {
	HasAVX512ER = true;
	} else if (Feature == "+avx512pf") {
	HasAVX512PF = true;
	} else if (Feature == "+avx512dq") {
	HasAVX512DQ = true;
	} else if (Feature == "+avx512bitalg") {
	HasAVX512BITALG = true;
	} else if (Feature == "+avx512bw") {
	HasAVX512BW = true;
	} else if (Feature == "+avx512vl") {
	HasAVX512VL = true;
	} else if (Feature == "+avx512vbmi") {
	HasAVX512VBMI = true;
	} else if (Feature == "+avx512vbmi2") {
	HasAVX512VBMI2 = true;
	} else if (Feature == "+avx512ifma") {
	HasAVX512IFMA = true;
	} else if (Feature == "+sha") {
	HasSHA = true;
	} else if (Feature == "+mpx") {
	HasMPX = true;
	} else if (Feature == "+shstk") {
	HasSHSTK = true;
	} else if (Feature == "+ibt") {
	HasIBT = true;
	} else if (Feature == "+movbe") {
	HasMOVBE = true;
	} else if (Feature == "+sgx") {
	HasSGX = true;
	} else if (Feature == "+cx16") {
	HasCX16 = true;
	} else if (Feature == "+fxsr") {
	HasFXSR = true;
	} else if (Feature == "+xsave") {
	HasXSAVE = true;
	} else if (Feature == "+xsaveopt") {
	HasXSAVEOPT = true;
	} else if (Feature == "+xsavec") {
	HasXSAVEC = true;
	} else if (Feature == "+xsaves") {
	HasXSAVES = true;
	} else if (Feature == "+mwaitx") {
	HasMWAITX = true;
	} else if (Feature == "+pku") {
	HasPKU = true;
	} else if (Feature == "+clflushopt") {
	HasCLFLUSHOPT = true;
	} else if (Feature == "+clwb") {
	HasCLWB = true;
	} else if (Feature == "+prefetchwt1") {
	HasPREFETCHWT1 = true;
	} else if (Feature == "+clzero") {
	HasCLZERO = true;
	+ } else if (Feature == "+retpoline") {
	+ HasRetpoline = true;
	+ } else if (Feature == "+retpoline-external-thunk") {
	+ HasRetpolineExternalThunk = true;
	}

	// Leveled features: translate the name to a level (or the "none" default
	// when the name is not part of that family) and keep the highest level
	// seen so far.
	X86SSEEnum Level = llvm::StringSwitch<X86SSEEnum>(Feature)
	.Case("+avx512f", AVX512F)
	.Case("+avx2", AVX2)
	.Case("+avx", AVX)
	.Case("+sse4.2", SSE42)
	.Case("+sse4.1", SSE41)
	.Case("+ssse3", SSSE3)
	.Case("+sse3", SSE3)
	.Case("+sse2", SSE2)
	.Case("+sse", SSE1)
	.Default(NoSSE);
	SSELevel = std::max(SSELevel, Level);

	MMX3DNowEnum ThreeDNowLevel = llvm::StringSwitch<MMX3DNowEnum>(Feature)
	.Case("+3dnowa", AMD3DNowAthlon)
	.Case("+3dnow", AMD3DNow)
	.Case("+mmx", MMX)
	.Default(NoMMX3DNow);
	MMX3DNowLevel = std::max(MMX3DNowLevel, ThreeDNowLevel);

	XOPEnum XLevel = llvm::StringSwitch<XOPEnum>(Feature)
	.Case("+xop", XOP)
	.Case("+fma4", FMA4)
	.Case("+sse4a", SSE4A)
	.Default(NoXOP);
	XOPLevel = std::max(XOPLevel, XLevel);
	}

	// LLVM doesn't have a separate switch for fpmath, so only accept it if it
	// matches the selected sse level.
	if ((FPMath == FP_SSE && SSELevel < SSE1) \|\|
	(FPMath == FP_387 && SSELevel >= SSE1)) {
	Diags.Report(diag::err_target_unsupported_fpmath)
	<< (FPMath == FP_SSE ? "sse" : "387");
	return false;
	}

	// 512 when avx512f is available, otherwise 256 when avx is available,
	// otherwise 128.
	SimdDefaultAlign =
	hasFeature("avx512f") ? 512 : hasFeature("avx") ? 256 : 128;
	return true;
	}

	/// X86TargetInfo::getTargetDefines - Return the set of the X86-specific macro
	/// definitions for this particular subtarget.
	void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	// Target identification.
	if (getTriple().getArch() == llvm::Triple::x86_64) {
	Builder.defineMacro("__amd64__");
	Builder.defineMacro("__amd64");
	Builder.defineMacro("__x86_64");
	Builder.defineMacro("__x86_64__");
	if (getTriple().getArchName() == "x86_64h") {
	Builder.defineMacro("__x86_64h");
	Builder.defineMacro("__x86_64h__");
	}
	} else {
	DefineStd(Builder, "i386", Opts);
	}

	// Subtarget options.
	// FIXME: We are hard-coding the tune parameters based on the CPU, but they
	// truly should be based on -mtune options.
	switch (CPU) {
	case CK_Generic:
	break;
	case CK_i386:
	// The rest are coming from the i386 define above.
	Builder.defineMacro("__tune_i386__");
	break;
	case CK_i486:
	case CK_WinChipC6:
	case CK_WinChip2:
	case CK_C3:
	defineCPUMacros(Builder, "i486");
	break;
	case CK_PentiumMMX:
	Builder.defineMacro("__pentium_mmx__");
	Builder.defineMacro("__tune_pentium_mmx__");
	LLVM_FALLTHROUGH;
	case CK_i586:
	case CK_Pentium:
	defineCPUMacros(Builder, "i586");
	defineCPUMacros(Builder, "pentium");
	break;
	case CK_Pentium3:
	case CK_PentiumM:
	Builder.defineMacro("__tune_pentium3__");
	LLVM_FALLTHROUGH;
	case CK_Pentium2:
	case CK_C3_2:
	Builder.defineMacro("__tune_pentium2__");
	LLVM_FALLTHROUGH;
	case CK_PentiumPro:
	defineCPUMacros(Builder, "i686");
	defineCPUMacros(Builder, "pentiumpro");
	break;
	case CK_Pentium4:
	defineCPUMacros(Builder, "pentium4");
	break;
	case CK_Yonah:
	case CK_Prescott:
	case CK_Nocona:
	defineCPUMacros(Builder, "nocona");
	break;
	case CK_Core2:
	case CK_Penryn:
	defineCPUMacros(Builder, "core2");
	break;
	case CK_Bonnell:
	defineCPUMacros(Builder, "atom");
	break;
	case CK_Silvermont:
	defineCPUMacros(Builder, "slm");
	break;
	case CK_Goldmont:
	defineCPUMacros(Builder, "goldmont");
	break;
	case CK_Nehalem:
	case CK_Westmere:
	case CK_SandyBridge:
	case CK_IvyBridge:
	case CK_Haswell:
	case CK_Broadwell:
	case CK_SkylakeClient:
	case CK_SkylakeServer:
	case CK_Cannonlake:
	case CK_Icelake:
	// FIXME: Historically, we defined this legacy name, it would be nice to
	// remove it at some point. We've never exposed fine-grained names for
	// recent primary x86 CPUs, and we should keep it that way.
	defineCPUMacros(Builder, "corei7");
	break;
	case CK_KNL:
	defineCPUMacros(Builder, "knl");
	break;
	case CK_KNM:
	break;
	case CK_Lakemont:
	defineCPUMacros(Builder, "i586", /Tuning/false);
	defineCPUMacros(Builder, "pentium", /Tuning/false);
	Builder.defineMacro("__tune_lakemont__");
	break;
	case CK_K6_2:
	Builder.defineMacro("__k6_2__");
	Builder.defineMacro("__tune_k6_2__");
	LLVM_FALLTHROUGH;
	case CK_K6_3:
	if (CPU != CK_K6_2) { // In case of fallthrough
	// FIXME: GCC may be enabling these in cases where some other k6
	// architecture is specified but -m3dnow is explicitly provided. The
	// exact semantics need to be determined and emulated here.
	Builder.defineMacro("__k6_3__");
	Builder.defineMacro("__tune_k6_3__");
	}
	LLVM_FALLTHROUGH;
	case CK_K6:
	defineCPUMacros(Builder, "k6");
	break;
	case CK_Athlon:
	case CK_AthlonXP:
	defineCPUMacros(Builder, "athlon");
	if (SSELevel != NoSSE) {
	Builder.defineMacro("__athlon_sse__");
	Builder.defineMacro("__tune_athlon_sse__");
	}
	break;
	case CK_K8:
	case CK_K8SSE3:
	case CK_x86_64:
	defineCPUMacros(Builder, "k8");
	break;
	case CK_AMDFAM10:
	defineCPUMacros(Builder, "amdfam10");
	break;
	case CK_BTVER1:
	defineCPUMacros(Builder, "btver1");
	break;
	case CK_BTVER2:
	defineCPUMacros(Builder, "btver2");
	break;
	case CK_BDVER1:
	defineCPUMacros(Builder, "bdver1");
	break;
	case CK_BDVER2:
	defineCPUMacros(Builder, "bdver2");
	break;
	case CK_BDVER3:
	defineCPUMacros(Builder, "bdver3");
	break;
	case CK_BDVER4:
	defineCPUMacros(Builder, "bdver4");
	break;
	case CK_ZNVER1:
	defineCPUMacros(Builder, "znver1");
	break;
	case CK_Geode:
	defineCPUMacros(Builder, "geode");
	break;
	}

	// Target properties.
	Builder.defineMacro("__REGISTER_PREFIX__", "");

	// Define __NO_MATH_INLINES on linux/x86 so that we don't get inline
	// functions in glibc header files that use FP Stack inline asm which the
	// backend can't deal with (PR879).
	Builder.defineMacro("__NO_MATH_INLINES");

	if (HasAES)
	Builder.defineMacro("__AES__");

	if (HasVAES)
	Builder.defineMacro("__VAES__");

	if (HasPCLMUL)
	Builder.defineMacro("__PCLMUL__");

	if (HasVPCLMULQDQ)
	Builder.defineMacro("__VPCLMULQDQ__");

	if (HasLZCNT)
	Builder.defineMacro("__LZCNT__");

	if (HasRDRND)
	Builder.defineMacro("__RDRND__");

	if (HasFSGSBASE)
	Builder.defineMacro("__FSGSBASE__");

	if (HasBMI)
	Builder.defineMacro("__BMI__");

	if (HasBMI2)
	Builder.defineMacro("__BMI2__");

	if (HasPOPCNT)
	Builder.defineMacro("__POPCNT__");

	if (HasRTM)
	Builder.defineMacro("__RTM__");

	if (HasPRFCHW)
	Builder.defineMacro("__PRFCHW__");

	if (HasRDSEED)
	Builder.defineMacro("__RDSEED__");

	if (HasADX)
	Builder.defineMacro("__ADX__");

	if (HasTBM)
	Builder.defineMacro("__TBM__");

	if (HasLWP)
	Builder.defineMacro("__LWP__");

	if (HasMWAITX)
	Builder.defineMacro("__MWAITX__");

	switch (XOPLevel) {
	case XOP:
	Builder.defineMacro("__XOP__");
	LLVM_FALLTHROUGH;
	case FMA4:
	Builder.defineMacro("__FMA4__");
	LLVM_FALLTHROUGH;
	case SSE4A:
	Builder.defineMacro("__SSE4A__");
	LLVM_FALLTHROUGH;
	case NoXOP:
	break;
	}

	if (HasFMA)
	Builder.defineMacro("__FMA__");

	if (HasF16C)
	Builder.defineMacro("__F16C__");

	if (HasGFNI)
	Builder.defineMacro("__GFNI__");

	if (HasAVX512CD)
	Builder.defineMacro("__AVX512CD__");
	if (HasAVX512VPOPCNTDQ)
	Builder.defineMacro("__AVX512VPOPCNTDQ__");
	if (HasAVX512VNNI)
	Builder.defineMacro("__AVX512VNNI__");
	if (HasAVX512ER)
	Builder.defineMacro("__AVX512ER__");
	if (HasAVX512PF)
	Builder.defineMacro("__AVX512PF__");
	if (HasAVX512DQ)
	Builder.defineMacro("__AVX512DQ__");
	if (HasAVX512BITALG)
	Builder.defineMacro("__AVX512BITALG__");
	if (HasAVX512BW)
	Builder.defineMacro("__AVX512BW__");
	if (HasAVX512VL)
	Builder.defineMacro("__AVX512VL__");
	if (HasAVX512VBMI)
	Builder.defineMacro("__AVX512VBMI__");
	if (HasAVX512VBMI2)
	Builder.defineMacro("__AVX512VBMI2__");
	if (HasAVX512IFMA)
	Builder.defineMacro("__AVX512IFMA__");

	if (HasSHA)
	Builder.defineMacro("__SHA__");

	if (HasFXSR)
	Builder.defineMacro("__FXSR__");
	if (HasXSAVE)
	Builder.defineMacro("__XSAVE__");
	if (HasXSAVEOPT)
	Builder.defineMacro("__XSAVEOPT__");
	if (HasXSAVEC)
	Builder.defineMacro("__XSAVEC__");
	if (HasXSAVES)
	Builder.defineMacro("__XSAVES__");
	if (HasPKU)
	Builder.defineMacro("__PKU__");
	if (HasCX16)
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16");
	if (HasCLFLUSHOPT)
	Builder.defineMacro("__CLFLUSHOPT__");
	if (HasCLWB)
	Builder.defineMacro("__CLWB__");
	if (HasMPX)
	Builder.defineMacro("__MPX__");
	if (HasSHSTK)
	Builder.defineMacro("__SHSTK__");
	if (HasSGX)
	Builder.defineMacro("__SGX__");
	if (HasPREFETCHWT1)
	Builder.defineMacro("__PREFETCHWT1__");
	if (HasCLZERO)
	Builder.defineMacro("__CLZERO__");

	// Each case falls through to the previous one here.
	switch (SSELevel) {
	case AVX512F:
	Builder.defineMacro("__AVX512F__");
	LLVM_FALLTHROUGH;
	case AVX2:
	Builder.defineMacro("__AVX2__");
	LLVM_FALLTHROUGH;
	case AVX:
	Builder.defineMacro("__AVX__");
	LLVM_FALLTHROUGH;
	case SSE42:
	Builder.defineMacro("__SSE4_2__");
	LLVM_FALLTHROUGH;
	case SSE41:
	Builder.defineMacro("__SSE4_1__");
	LLVM_FALLTHROUGH;
	case SSSE3:
	Builder.defineMacro("__SSSE3__");
	LLVM_FALLTHROUGH;
	case SSE3:
	Builder.defineMacro("__SSE3__");
	LLVM_FALLTHROUGH;
	case SSE2:
	Builder.defineMacro("__SSE2__");
	Builder.defineMacro("__SSE2_MATH__"); // -mfp-math=sse always implied.
	LLVM_FALLTHROUGH;
	case SSE1:
	Builder.defineMacro("__SSE__");
	Builder.defineMacro("__SSE_MATH__"); // -mfp-math=sse always implied.
	LLVM_FALLTHROUGH;
	case NoSSE:
	break;
	}

	if (Opts.MicrosoftExt && getTriple().getArch() == llvm::Triple::x86) {
	switch (SSELevel) {
	case AVX512F:
	case AVX2:
	case AVX:
	case SSE42:
	case SSE41:
	case SSSE3:
	case SSE3:
	case SSE2:
	Builder.defineMacro("_M_IX86_FP", Twine(2));
	break;
	case SSE1:
	Builder.defineMacro("_M_IX86_FP", Twine(1));
	break;
	default:
	Builder.defineMacro("_M_IX86_FP", Twine(0));
	break;
	}
	}

	// Each case falls through to the previous one here.
	switch (MMX3DNowLevel) {
	case AMD3DNowAthlon:
	Builder.defineMacro("__3dNOW_A__");
	LLVM_FALLTHROUGH;
	case AMD3DNow:
	Builder.defineMacro("__3dNOW__");
	LLVM_FALLTHROUGH;
	case MMX:
	Builder.defineMacro("__MMX__");
	LLVM_FALLTHROUGH;
	case NoMMX3DNow:
	break;
	}

	if (CPU >= CK_i486) {
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
	}
	if (CPU >= CK_i586)
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");

	if (HasFloat128)
	Builder.defineMacro("__SIZEOF_FLOAT128__", "16");
	}

	// Reports whether Name is one of the feature spellings this target accepts.
	// Any spelling that is not in the table below is rejected.
	bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
	  // Every accepted feature spelling.
	  static const char *const KnownFeatures[] = {
	      "3dnow",       "3dnowa",     "adx",        "aes",
	      "avx",         "avx2",       "avx512f",    "avx512cd",
	      "avx512vpopcntdq",           "avx512vnni", "avx512er",
	      "avx512pf",    "avx512dq",   "avx512bitalg",
	      "avx512bw",    "avx512vl",   "avx512vbmi", "avx512vbmi2",
	      "avx512ifma",  "bmi",        "bmi2",       "clflushopt",
	      "clwb",        "clzero",     "cx16",       "f16c",
	      "fma",         "fma4",       "fsgsbase",   "fxsr",
	      "gfni",        "lwp",        "lzcnt",      "mmx",
	      "movbe",       "mpx",        "mwaitx",     "pclmul",
	      "pku",         "popcnt",     "prefetchwt1", "prfchw",
	      "rdrnd",       "rdseed",     "rtm",        "sgx",
	      "sha",         "shstk",      "sse",        "sse2",
	      "sse3",        "ssse3",      "sse4",       "sse4.1",
	      "sse4.2",      "sse4a",      "tbm",        "vaes",
	      "vpclmulqdq",  "x87",        "xop",        "xsave",
	      "xsavec",      "xsaves",     "xsaveopt",
	  };

	  // Exact, case-sensitive match against the table.
	  for (const char *Candidate : KnownFeatures)
	    if (Name.equals(Candidate))
	      return true;
	  return false;
	}

	// Reports whether the named feature is currently enabled on this target.
	//
	// Most names map directly onto one of the Has* flags. The SSE/AVX, MMX/3DNow
	// and XOP families are stored as ordered levels, so those entries use ">="
	// comparisons: a higher level also answers true for the lower ones. "x86" is
	// always true, while "x86_32"/"x86_64" depend on the triple's architecture.
	// Names not listed here yield false.
	bool X86TargetInfo::hasFeature(StringRef Feature) const {
	return llvm::StringSwitch<bool>(Feature)
	.Case("adx", HasADX)
	.Case("aes", HasAES)
	.Case("avx", SSELevel >= AVX)
	.Case("avx2", SSELevel >= AVX2)
	.Case("avx512f", SSELevel >= AVX512F)
	.Case("avx512cd", HasAVX512CD)
	.Case("avx512vpopcntdq", HasAVX512VPOPCNTDQ)
	.Case("avx512vnni", HasAVX512VNNI)
	.Case("avx512er", HasAVX512ER)
	.Case("avx512pf", HasAVX512PF)
	.Case("avx512dq", HasAVX512DQ)
	.Case("avx512bitalg", HasAVX512BITALG)
	.Case("avx512bw", HasAVX512BW)
	.Case("avx512vl", HasAVX512VL)
	.Case("avx512vbmi", HasAVX512VBMI)
	.Case("avx512vbmi2", HasAVX512VBMI2)
	.Case("avx512ifma", HasAVX512IFMA)
	.Case("bmi", HasBMI)
	.Case("bmi2", HasBMI2)
	.Case("clflushopt", HasCLFLUSHOPT)
	.Case("clwb", HasCLWB)
	.Case("clzero", HasCLZERO)
	.Case("cx16", HasCX16)
	.Case("f16c", HasF16C)
	.Case("fma", HasFMA)
	.Case("fma4", XOPLevel >= FMA4)
	.Case("fsgsbase", HasFSGSBASE)
	.Case("fxsr", HasFXSR)
	.Case("gfni", HasGFNI)
	.Case("ibt", HasIBT)
	.Case("lwp", HasLWP)
	.Case("lzcnt", HasLZCNT)
	// Note: the 3DNow levels are queried here as "mm3dnow"/"mm3dnowa".
	.Case("mm3dnow", MMX3DNowLevel >= AMD3DNow)
	.Case("mm3dnowa", MMX3DNowLevel >= AMD3DNowAthlon)
	.Case("mmx", MMX3DNowLevel >= MMX)
	.Case("movbe", HasMOVBE)
	.Case("mpx", HasMPX)
	.Case("mwaitx", HasMWAITX)
	.Case("pclmul", HasPCLMUL)
	.Case("pku", HasPKU)
	.Case("popcnt", HasPOPCNT)
	.Case("prefetchwt1", HasPREFETCHWT1)
	.Case("prfchw", HasPRFCHW)
	.Case("rdrnd", HasRDRND)
	.Case("rdseed", HasRDSEED)
	+ .Case("retpoline", HasRetpoline)
	+ .Case("retpoline-external-thunk", HasRetpolineExternalThunk)
	.Case("rtm", HasRTM)
	.Case("sgx", HasSGX)
	.Case("sha", HasSHA)
	.Case("shstk", HasSHSTK)
	.Case("sse", SSELevel >= SSE1)
	.Case("sse2", SSELevel >= SSE2)
	.Case("sse3", SSELevel >= SSE3)
	.Case("ssse3", SSELevel >= SSSE3)
	.Case("sse4.1", SSELevel >= SSE41)
	.Case("sse4.2", SSELevel >= SSE42)
	.Case("sse4a", XOPLevel >= SSE4A)
	.Case("tbm", HasTBM)
	.Case("vaes", HasVAES)
	.Case("vpclmulqdq", HasVPCLMULQDQ)
	.Case("x86", true)
	.Case("x86_32", getTriple().getArch() == llvm::Triple::x86)
	.Case("x86_64", getTriple().getArch() == llvm::Triple::x86_64)
	.Case("xop", XOPLevel >= XOP)
	.Case("xsave", HasXSAVE)
	.Case("xsavec", HasXSAVEC)
	.Case("xsaves", HasXSAVES)
	.Case("xsaveopt", HasXSAVEOPT)
	.Default(false);
	}

	// We can't use a generic validation scheme for the features accepted here
	// versus subtarget features accepted in the target attribute because the
	// bitfield structure that's initialized in the runtime only supports the
	// below currently rather than the full range of subtarget features. (See
	// X86TargetInfo::hasFeature for a somewhat comprehensive list).
	//
	// Returns true only for strings listed as X86_FEATURE_COMPAT entries in
	// X86TargetParser.def; every other string is rejected.
	bool X86TargetInfo::validateCpuSupports(StringRef FeatureStr) const {
	return llvm::StringSwitch<bool>(FeatureStr)
	// Each X86_FEATURE_COMPAT entry in the .def file expands to one accepting
	// .Case(STR, true) link in the chain.
	#define X86_FEATURE_COMPAT(VAL, ENUM, STR) .Case(STR, true)
	#include "llvm/Support/X86TargetParser.def"
	.Default(false);
	}

	// We can't use a generic validation scheme for the cpus accepted here
	// versus subtarget cpus accepted in the target attribute because the
	// variables initialized by the runtime only support the below currently
	// rather than the full range of cpus.
	//
	// Returns true only for the vendor, CPU type (including aliases) and CPU
	// subtype strings listed in X86TargetParser.def; anything else is rejected.
	bool X86TargetInfo::validateCpuIs(StringRef FeatureStr) const {
	return llvm::StringSwitch<bool>(FeatureStr)
	// Each entry kind in the .def file expands to an accepting .Case/.Cases.
	#define X86_VENDOR(ENUM, STRING) .Case(STRING, true)
	#define X86_CPU_TYPE_COMPAT_WITH_ALIAS(ARCHNAME, ENUM, STR, ALIAS) \
	.Cases(STR, ALIAS, true)
	#define X86_CPU_TYPE_COMPAT(ARCHNAME, ENUM, STR) .Case(STR, true)
	#define X86_CPU_SUBTYPE_COMPAT(ARCHNAME, ENUM, STR) .Case(STR, true)
	#include "llvm/Support/X86TargetParser.def"
	.Default(false);
	}

	// Validates the x86-specific inline-asm constraint that starts at *Name and
	// records in Info what kind of operand it accepts.
	//
	// Returns false for unrecognized constraint letters. Name is taken by
	// reference: for the two-character 'Y<x>' constraints it is advanced to the
	// second character, so the caller sees the updated position.
	bool X86TargetInfo::validateAsmConstraint(
	const char *&Name, TargetInfo::ConstraintInfo &Info) const {
	switch (*Name) {
	default:
	return false;
	// Constant constraints.
	case 'e': // 32-bit signed integer constant for use with sign-extending x86_64
	// instructions.
	case 'Z': // 32-bit unsigned integer constant for use with zero-extending
	// x86_64 instructions.
	case 's':
	Info.setRequiresImmediate();
	return true;
	// The following letters each restrict the immediate to a fixed range or set.
	case 'I':
	Info.setRequiresImmediate(0, 31);
	return true;
	case 'J':
	Info.setRequiresImmediate(0, 63);
	return true;
	case 'K':
	Info.setRequiresImmediate(-128, 127);
	return true;
	case 'L':
	Info.setRequiresImmediate({int(0xff), int(0xffff), int(0xffffffff)});
	return true;
	case 'M':
	Info.setRequiresImmediate(0, 3);
	return true;
	case 'N':
	Info.setRequiresImmediate(0, 255);
	return true;
	case 'O':
	Info.setRequiresImmediate(0, 127);
	return true;
	// Register constraints.
	case 'Y': // 'Y' is the first character for several 2-character constraints.
	// Shift the pointer to the second character of the constraint.
	Name++;
	// Every path of this inner switch returns, so control never reaches
	// the 'f' case below from here.
	switch (*Name) {
	default:
	return false;
	case 'z':
	case '0': // First SSE register.
	case '2':
	case 't': // Any SSE register, when SSE2 is enabled.
	case 'i': // Any SSE register, when SSE2 and inter-unit moves enabled.
	case 'm': // Any MMX register, when inter-unit moves enabled.
	case 'k': // AVX512 arch mask registers: k1-k7.
	Info.setAllowsRegister();
	return true;
	}
	case 'f': // Any x87 floating point stack register.
	// Constraint 'f' cannot be used for output operands.
	if (Info.ConstraintStr[0] == '=')
	return false;
	Info.setAllowsRegister();
	return true;
	case 'a': // eax.
	case 'b': // ebx.
	case 'c': // ecx.
	case 'd': // edx.
	case 'S': // esi.
	case 'D': // edi.
	case 'A': // edx:eax.
	case 't': // Top of floating point stack.
	case 'u': // Second from top of floating point stack.
	case 'q': // Any register accessible as [r]l: a, b, c, and d.
	case 'y': // Any MMX register.
	case 'v': // Any {X,Y,Z}MM register (Arch & context dependent)
	case 'x': // Any SSE register.
	case 'k': // Any AVX512 mask register (same as Yk, additionally allows k0
	// for intermediate k reg operations).
	case 'Q': // Any register accessible as [r]h: a, b, c, and d.
	case 'R': // "Legacy" registers: ax, bx, cx, dx, di, si, sp, bp.
	case 'l': // "Index" registers: any general register that can be used as an
	// index in a base+index memory access.
	Info.setAllowsRegister();
	return true;
	// Floating point constant constraints.
	case 'C': // SSE floating point constant.
	case 'G': // x87 floating point constant.
	return true;
	}
	}

	// Checks whether an output operand of Size bits is acceptable for the given
	// constraint string.
	//
	// Output constraints may begin with the modifiers '=', '+' and '&'; these
	// are removed before the remaining constraint is handed to
	// validateOperandSize, which makes the actual size decision.
	bool X86TargetInfo::validateOutputSize(StringRef Constraint,
	                                       unsigned Size) const {
	  // Strip off constraint modifiers. ltrim stops cleanly at the end of the
	  // string, so a constraint made up solely of modifiers is never indexed
	  // past its end while stripping.
	  Constraint = Constraint.ltrim("=+&");

	  return validateOperandSize(Constraint, Size);
	}

	// Checks whether an input operand of Size bits is acceptable for the given
	// constraint. The constraint is forwarded to validateOperandSize unchanged
	// (no modifier stripping is done here).
	bool X86TargetInfo::validateInputSize(StringRef Constraint,
	unsigned Size) const {
	return validateOperandSize(Constraint, Size);
	}

	// Decides whether an operand of Size bits fits the register class selected
	// by Constraint (modifiers already stripped).
	//
	// Constraints with no size rule here are accepted. The vector-register
	// constraints ('v', 'x', and 'Yi'/'Yt'/'Y2' when SSE2 is available) share one
	// limit that depends on the enabled SSE/AVX level.
	bool X86TargetInfo::validateOperandSize(StringRef Constraint,
	                                        unsigned Size) const {
	  switch (Constraint[0]) {
	  default:
	    // No size restriction known for this constraint.
	    return true;

	  case 'k': // Registers k0-k7 (AVX512) size limit is 64 bit.
	  case 'y':
	    return Size <= 64;

	  case 'f':
	  case 't':
	  case 'u':
	    return Size <= 128;

	  case 'Y':
	    // 'Y' is the first character for several 2-character constraints.
	    switch (Constraint[1]) {
	    default:
	      return false;
	    case 'm': // 'Ym' is synonymous with 'y'.
	    case 'k':
	      return Size <= 64;
	    case 'z':
	    case '0':
	      // XMM0
	      return SSELevel >= SSE1 ? Size <= 128U : false;
	    case 'i':
	    case 't':
	    case '2':
	      // 'Yi','Yt','Y2' are synonymous with 'x' when SSE2 is enabled.
	      if (SSELevel < SSE2)
	        return false;
	      break;
	    }
	    // Only the 'x' synonyms get here; they use the vector limits below.
	    break;

	  case 'v':
	  case 'x':
	    break;
	  }

	  // Vector register limits, widest supported register first.
	  if (SSELevel >= AVX512F)
	    return Size <= 512U; // 512-bit zmm registers need AVX512F.
	  if (SSELevel >= AVX)
	    return Size <= 256U; // 256-bit ymm registers need AVX.
	  return Size <= 128U;
	}

	// Translates the GCC-style constraint at *Constraint into the spelling used
	// for LLVM.
	//
	// Fixed-register letters become "{reg}" names, 'p' becomes "im", and the
	// recognized two-letter 'Y<x>' constraints become "^Y<x>". In that last case
	// Constraint is advanced by one character so it points at the second letter.
	// Anything else is returned as the single character itself.
	std::string X86TargetInfo::convertConstraint(const char *&Constraint) const {
	  const char Letter = *Constraint;

	  if (Letter == 'Y') {
	    switch (Constraint[1]) {
	    case 'k':
	    case 'm':
	    case 'i':
	    case 't':
	    case 'z':
	    case '0':
	    case '2': {
	      // "^" hints llvm that this is a 2 letter constraint.
	      std::string TwoLetter("^");
	      TwoLetter.append(Constraint, 2);
	      // Promote the string iterator to the next constraint character.
	      ++Constraint;
	      return TwoLetter;
	    }
	    default:
	      // Unknown second letter: copy just the 'Y' and let parsing continue.
	      return std::string(1, Letter);
	    }
	  }

	  switch (Letter) {
	  case 'a':
	    return "{ax}";
	  case 'b':
	    return "{bx}";
	  case 'c':
	    return "{cx}";
	  case 'd':
	    return "{dx}";
	  case 'S':
	    return "{si}";
	  case 'D':
	    return "{di}";
	  case 'p': // address
	    return "im";
	  case 't': // top of floating point stack.
	    return "{st}";
	  case 'u': // second from top of floating point stack.
	    return "{st(1)}";
	  default:
	    return std::string(1, Letter);
	  }
	}

	// Reports whether the given CPU kind is acceptable for this target.
	//
	// CK_Generic (no processor selected) is rejected. Every CPU listed in
	// X86Target.def is accepted when it is flagged as 64-bit capable or when the
	// target triple's architecture is 32-bit x86.
	bool X86TargetInfo::checkCPUKind(CPUKind Kind) const {
	  // Perform any per-CPU checks necessary to determine if this CPU is
	  // acceptable.
	  // FIXME: This results in terrible diagnostics. Clang just says the CPU is
	  // invalid without explaining why.
	  switch (Kind) {
	  case CK_Generic:
	    // No processor selected!
	    return false;
	// One case per PROC entry in the .def file.
	#define PROC(ENUM, STRING, IS64BIT)                                            \
	  case CK_##ENUM:                                                              \
	    return IS64BIT || getTriple().getArch() == llvm::Triple::x86;
	#include "clang/Basic/X86Target.def"
	  }
	  llvm_unreachable("Unhandled CPU kind");
	}

	// Maps a CPU name to its CPUKind enumerator. Both the primary names (PROC
	// entries) and the aliases (PROC_ALIAS entries) from X86Target.def are
	// recognized; any other name yields CK_Generic.
	X86TargetInfo::CPUKind X86TargetInfo::getCPUKind(StringRef CPU) const {
	return llvm::StringSwitch<CPUKind>(CPU)
	#define PROC(ENUM, STRING, IS64BIT) .Case(STRING, CK_##ENUM)
	#define PROC_ALIAS(ENUM, ALIAS) .Case(ALIAS, CK_##ENUM)
	#include "clang/Basic/X86Target.def"
	.Default(CK_Generic);
	}

	// Returns a view over the GCCRegNames table (defined elsewhere in this file).
	ArrayRef<const char *> X86TargetInfo::getGCCRegNames() const {
	return llvm::makeArrayRef(GCCRegNames);
	}

	// Returns a view over the AddlRegNames table (defined elsewhere in this
	// file).
	ArrayRef<TargetInfo::AddlRegName> X86TargetInfo::getGCCAddlRegNames() const {
	return llvm::makeArrayRef(AddlRegNames);
	}

	// Returns the leading part of BuiltinInfoX86 visible to 32-bit targets: the
	// element count runs from FirstTSBuiltin up to LastX86CommonBuiltin, with
	// the "+ 1" making that last enumerator part of the range.
	ArrayRef<Builtin::Info> X86_32TargetInfo::getTargetBuiltins() const {
	return llvm::makeArrayRef(BuiltinInfoX86, clang::X86::LastX86CommonBuiltin -
	Builtin::FirstTSBuiltin + 1);
	}

	// Returns the part of BuiltinInfoX86 visible to 64-bit targets: the element
	// count runs from FirstTSBuiltin up to, but not including, LastTSBuiltin.
	ArrayRef<Builtin::Info> X86_64TargetInfo::getTargetBuiltins() const {
	return llvm::makeArrayRef(BuiltinInfoX86,
	X86::LastTSBuiltin - Builtin::FirstTSBuiltin);
	}
	Index: head/contrib/llvm/tools/clang/lib/Basic/Targets/X86.h
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Basic/Targets/X86.h (revision 328816)
	+++ head/contrib/llvm/tools/clang/lib/Basic/Targets/X86.h (revision 328817)
	@@ -1,807 +1,809 @@
	//===--- X86.h - Declare X86 target feature support -------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file declares X86 TargetInfo objects.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_X86_H
	#define LLVM_CLANG_LIB_BASIC_TARGETS_X86_H

	#include "OSTargets.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/TargetOptions.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Support/Compiler.h"

	namespace clang {
	namespace targets {

	// X86 target abstract base class; x86-32 and x86-64 are very close, so
	// most of the implementation can be shared.
	//
	// The private state below records which instruction-set extensions are
	// enabled. The SSE/AVX, MMX/3DNow and XOP families are kept as ordered
	// levels; the remaining extensions are independent boolean flags.
	class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {

	  // Ordered SSE/AVX level; members compare it with >= and <.
	  enum X86SSEEnum {
	    NoSSE,
	    SSE1,
	    SSE2,
	    SSE3,
	    SSSE3,
	    SSE41,
	    SSE42,
	    AVX,
	    AVX2,
	    AVX512F
	  } SSELevel = NoSSE;
	  // Ordered MMX/3DNow level.
	  enum MMX3DNowEnum {
	    NoMMX3DNow,
	    MMX,
	    AMD3DNow,
	    AMD3DNowAthlon
	  } MMX3DNowLevel = NoMMX3DNow;
	  // Ordered SSE4A/FMA4/XOP level.
	  enum XOPEnum { NoXOP, SSE4A, FMA4, XOP } XOPLevel = NoXOP;

	  // Independent feature flags; all start out disabled.
	  bool HasAES = false;
	  bool HasVAES = false;
	  bool HasPCLMUL = false;
	  bool HasVPCLMULQDQ = false;
	  bool HasGFNI = false;
	  bool HasLZCNT = false;
	  bool HasRDRND = false;
	  bool HasFSGSBASE = false;
	  bool HasBMI = false;
	  bool HasBMI2 = false;
	  bool HasPOPCNT = false;
	  bool HasRTM = false;
	  bool HasPRFCHW = false;
	  bool HasRDSEED = false;
	  bool HasADX = false;
	  bool HasTBM = false;
	  bool HasLWP = false;
	  bool HasFMA = false;
	  bool HasF16C = false;
	  bool HasAVX512CD = false;
	  bool HasAVX512VPOPCNTDQ = false;
	  bool HasAVX512VNNI = false;
	  bool HasAVX512ER = false;
	  bool HasAVX512PF = false;
	  bool HasAVX512DQ = false;
	  bool HasAVX512BITALG = false;
	  bool HasAVX512BW = false;
	  bool HasAVX512VL = false;
	  bool HasAVX512VBMI = false;
	  bool HasAVX512VBMI2 = false;
	  bool HasAVX512IFMA = false;
	  bool HasSHA = false;
	  bool HasMPX = false;
	  bool HasSHSTK = false;
	  bool HasIBT = false;
	  bool HasSGX = false;
	  bool HasCX16 = false;
	  bool HasFXSR = false;
	  bool HasXSAVE = false;
	  bool HasXSAVEOPT = false;
	  bool HasXSAVEC = false;
	  bool HasXSAVES = false;
	  bool HasMWAITX = false;
	  bool HasCLZERO = false;
	  bool HasPKU = false;
	  bool HasCLFLUSHOPT = false;
	  bool HasCLWB = false;
	  bool HasMOVBE = false;
	  bool HasPREFETCHWT1 = false;
	+ bool HasRetpoline = false;
	+ bool HasRetpolineExternalThunk = false;

	  /// \brief Enumeration of all of the X86 CPUs supported by Clang.
	  ///
	  /// Each enumeration represents a particular CPU supported by Clang. These
	  /// loosely correspond to the options passed to '-march' or '-mtune' flags.
	  enum CPUKind {
	    CK_Generic,
	#define PROC(ENUM, STRING, IS64BIT) CK_##ENUM,
	#include "clang/Basic/X86Target.def"
	  } CPU = CK_Generic;

	  // Returns whether Kind is acceptable for this target.
	  bool checkCPUKind(CPUKind Kind) const;

	  // Maps a CPU name to its CPUKind.
	  CPUKind getCPUKind(StringRef CPU) const;

	  enum FPMathKind { FP_Default, FP_SSE, FP_387 } FPMath = FP_Default;

	public:
	  X86TargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	      : TargetInfo(Triple) {
	    LongDoubleFormat = &llvm::APFloat::x87DoubleExtended();
	  }

	  unsigned getFloatEvalMethod() const override {
	    // X87 evaluates with 80 bits "long double" precision.
	    return SSELevel == NoSSE ? 2 : 0;
	  }

	  ArrayRef<const char *> getGCCRegNames() const override;

	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	    return None;
	  }

	  ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;

	  bool validateCpuSupports(StringRef Name) const override;

	  bool validateCpuIs(StringRef Name) const override;

	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &info) const override;

	  // Accepts only "esp" and "ebp" as global register variables and reports
	  // through HasSizeMismatch whether RegSize differs from 32 bits.
	  bool validateGlobalRegisterVariable(StringRef RegName, unsigned RegSize,
	                                      bool &HasSizeMismatch) const override {
	    // esp and ebp are the only 32-bit registers the x86 backend can currently
	    // handle.
	    if (RegName.equals("esp") || RegName.equals("ebp")) {
	      // Check that the register size is 32-bit.
	      HasSizeMismatch = RegSize != 32;
	      return true;
	    }

	    return false;
	  }

	  bool validateOutputSize(StringRef Constraint, unsigned Size) const override;

	  bool validateInputSize(StringRef Constraint, unsigned Size) const override;

	  virtual bool validateOperandSize(StringRef Constraint, unsigned Size) const;

	  std::string convertConstraint(const char *&Constraint) const override;
	  const char *getClobbers() const override {
	    return "~{dirflag},~{fpsr},~{flags}";
	  }

	  // Returns the register name implied by the first alphabetic character of
	  // Constraint, Expression itself for 'r', or "" when no register is implied.
	  StringRef getConstraintRegister(const StringRef &Constraint,
	                                  const StringRef &Expression) const override {
	    // Skip leading non-alphabetic characters.
	    StringRef::iterator I, E;
	    for (I = Constraint.begin(), E = Constraint.end(); I != E; ++I) {
	      if (isalpha(*I))
	        break;
	    }
	    if (I == E)
	      return "";
	    switch (*I) {
	    // For the register constraints, return the matching register name
	    case 'a':
	      return "ax";
	    case 'b':
	      return "bx";
	    case 'c':
	      return "cx";
	    case 'd':
	      return "dx";
	    case 'S':
	      return "si";
	    case 'D':
	      return "di";
	    // In case the constraint is 'r' we need to return Expression
	    case 'r':
	      return Expression;
	    // Double letters Y<x> constraints
	    case 'Y':
	      // Inspect the character after 'Y'; the iterator must be dereferenced
	      // to compare it against a character literal.
	      if ((++I != E) && ((*I == '0') || (*I == 'z')))
	        return "xmm0";
	      break;
	    default:
	      break;
	    }
	    return "";
	  }

	  bool useFP16ConversionIntrinsics() const override {
	    return false;
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override;

	  static void setSSELevel(llvm::StringMap<bool> &Features, X86SSEEnum Level,
	                          bool Enabled);

	  static void setMMXLevel(llvm::StringMap<bool> &Features, MMX3DNowEnum Level,
	                          bool Enabled);

	  static void setXOPLevel(llvm::StringMap<bool> &Features, XOPEnum Level,
	                          bool Enabled);

	  void setFeatureEnabled(llvm::StringMap<bool> &Features, StringRef Name,
	                         bool Enabled) const override {
	    setFeatureEnabledImpl(Features, Name, Enabled);
	  }

	  // This exists purely to cut down on the number of virtual calls in
	  // initFeatureMap which calls this repeatedly.
	  static void setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
	                                    StringRef Name, bool Enabled);

	  bool
	  initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	                 StringRef CPU,
	                 const std::vector<std::string> &FeaturesVec) const override;

	  bool isValidFeatureName(StringRef Name) const override;

	  bool hasFeature(StringRef Feature) const override;

	  bool handleTargetFeatures(std::vector<std::string> &Features,
	                            DiagnosticsEngine &Diags) override;

	  // Names the ABI variant: "avx512" or "avx" on x86-64 depending on the SSE
	  // level, "no-mmx" on 32-bit x86 without MMX, and "" otherwise.
	  StringRef getABI() const override {
	    if (getTriple().getArch() == llvm::Triple::x86_64 && SSELevel >= AVX512F)
	      return "avx512";
	    if (getTriple().getArch() == llvm::Triple::x86_64 && SSELevel >= AVX)
	      return "avx";
	    if (getTriple().getArch() == llvm::Triple::x86 &&
	        MMX3DNowLevel == NoMMX3DNow)
	      return "no-mmx";
	    return "";
	  }

	  bool isValidCPUName(StringRef Name) const override {
	    return checkCPUKind(getCPUKind(Name));
	  }

	  // Stores the CPU kind for Name (CK_Generic when unknown) and returns
	  // whether that kind is acceptable for this target.
	  bool setCPU(const std::string &Name) override {
	    return checkCPUKind(CPU = getCPUKind(Name));
	  }

	  bool setFPMath(StringRef Name) override;

	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    // Most of the non-ARM calling conventions are i386 conventions.
	    switch (CC) {
	    case CC_X86ThisCall:
	    case CC_X86FastCall:
	    case CC_X86StdCall:
	    case CC_X86VectorCall:
	    case CC_X86RegCall:
	    case CC_C:
	    case CC_Swift:
	    case CC_X86Pascal:
	    case CC_IntelOclBicc:
	    case CC_OpenCLKernel:
	      return CCCR_OK;
	    default:
	      return CCCR_Warning;
	    }
	  }

	  CallingConv getDefaultCallingConv(CallingConvMethodType MT) const override {
	    return MT == CCMT_Member ? CC_X86ThisCall : CC_C;
	  }

	  bool hasSjLjLowering() const override { return true; }

	  void setSupportedOpenCLOpts() override {
	    getSupportedOpenCLOpts().supportAll();
	  }
	};

	// X86-32 generic target
	//
	// Sets the 32-bit type sizes, alignments and data layout in its constructor
	// and adds size limits for the general-purpose register constraints on top
	// of the shared X86TargetInfo rules.
	class LLVM_LIBRARY_VISIBILITY X86_32TargetInfo : public X86TargetInfo {
	public:
	  X86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86TargetInfo(Triple, Opts) {
	    DoubleAlign = LongLongAlign = 32;
	    LongDoubleWidth = 96;
	    LongDoubleAlign = 32;
	    SuitableAlign = 128;
	    resetDataLayout("e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128");
	    SizeType = UnsignedInt;
	    PtrDiffType = SignedInt;
	    IntPtrType = SignedInt;
	    RegParmMax = 3;

	    // Use fpret for all types.
	    RealTypeUsesObjCFPRet =
	        ((1 << TargetInfo::Float) | (1 << TargetInfo::Double) |
	         (1 << TargetInfo::LongDouble));

	    // x86-32 has atomics up to 8 bytes
	    // FIXME: Check that we actually have cmpxchg8b before setting
	    // MaxAtomicInlineWidth. (cmpxchg8b is an i586 instruction.)
	    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }

	  // Maps EH data register index 0 to 0 and index 1 to 2; any other index
	  // yields -1.
	  int getEHDataRegisterNumber(unsigned RegNo) const override {
	    if (RegNo == 0)
	      return 0;
	    if (RegNo == 1)
	      return 2;
	    return -1;
	  }

	  // Limits the general-purpose register constraints to 32 bits ('A' to 64
	  // bits) and defers every other constraint to the base class.
	  bool validateOperandSize(StringRef Constraint, unsigned Size) const override {
	    switch (Constraint[0]) {
	    default:
	      break;
	    case 'R':
	    case 'q':
	    case 'Q':
	    case 'a':
	    case 'b':
	    case 'c':
	    case 'd':
	    case 'S':
	    case 'D':
	      return Size <= 32;
	    case 'A':
	      return Size <= 64;
	    }

	    return X86TargetInfo::validateOperandSize(Constraint, Size);
	  }

	  ArrayRef<Builtin::Info> getTargetBuiltins() const override;
	};

	// x86-32 NetBSD target; only the floating-point evaluation method differs
	// from the generic NetBSD/x86-32 behaviour, and only for older OS versions.
	class LLVM_LIBRARY_VISIBILITY NetBSDI386TargetInfo
	    : public NetBSDTargetInfo<X86_32TargetInfo> {
	public:
	  NetBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : NetBSDTargetInfo<X86_32TargetInfo>(Triple, Opts) {}

	  unsigned getFloatEvalMethod() const override {
	    unsigned Major, Minor, Micro;
	    getTriple().getOSVersion(Major, Minor, Micro);
	    // New NetBSD uses the default rounding mode. A major version of 0 is
	    // treated the same way as a new release.
	    if (Major >= 7 || (Major == 6 && Minor == 99 && Micro >= 26) || Major == 0)
	      return X86_32TargetInfo::getFloatEvalMethod();
	    // NetBSD before 6.99.26 defaults to "double" rounding.
	    return 1;
	  }
	};

	// x86-32 OpenBSD target: uses the "long" flavours for the size, pointer
	// difference and integer-pointer types.
	class LLVM_LIBRARY_VISIBILITY OpenBSDI386TargetInfo
	    : public OpenBSDTargetInfo<X86_32TargetInfo> {
	  using BaseInfo = OpenBSDTargetInfo<X86_32TargetInfo>;

	public:
	  OpenBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : BaseInfo(Triple, Opts) {
	    // Signed pointer-sized types first, then the unsigned size type.
	    PtrDiffType = SignedLong;
	    IntPtrType = SignedLong;
	    SizeType = UnsignedLong;
	  }
	};

	// x86-32 Darwin target: 128-bit long double, Darwin data layout, and a
	// maximum vector alignment that is refined once the feature set is known.
	class LLVM_LIBRARY_VISIBILITY DarwinI386TargetInfo
	    : public DarwinTargetInfo<X86_32TargetInfo> {
	  using BaseInfo = DarwinTargetInfo<X86_32TargetInfo>;

	public:
	  DarwinI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : BaseInfo(Triple, Opts) {
	    // Type widths and alignments.
	    LongDoubleWidth = 128;
	    LongDoubleAlign = 128;
	    SuitableAlign = 128;
	    MaxVectorAlign = 256;
	    SizeType = UnsignedLong;
	    IntPtrType = SignedLong;

	    // The watchOS simulator uses the builtin bool type for Objective-C.
	    if (Triple.isWatchOS())
	      UseSignedCharForObjCBool = false;

	    resetDataLayout("e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128");
	    HasAlignMac68kSupport = true;
	  }

	  bool handleTargetFeatures(std::vector<std::string> &Features,
	                            DiagnosticsEngine &Diags) override {
	    const bool BaseAccepted = BaseInfo::handleTargetFeatures(Features, Diags);
	    if (!BaseAccepted)
	      return false;

	    // We now know the features we have: we can decide how to align vectors.
	    if (hasFeature("avx512f"))
	      MaxVectorAlign = 512;
	    else if (hasFeature("avx"))
	      MaxVectorAlign = 256;
	    else
	      MaxVectorAlign = 128;
	    return true;
	  }
	};

	// x86-32 Windows target
	//
	// Uses 64-bit alignment for double and long long, and picks the data layout
	// according to whether the triple is Windows with the COFF object format.
	class LLVM_LIBRARY_VISIBILITY WindowsX86_32TargetInfo
	    : public WindowsTargetInfo<X86_32TargetInfo> {
	public:
	  WindowsX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : WindowsTargetInfo<X86_32TargetInfo>(Triple, Opts) {
	    LongLongAlign = 64;
	    DoubleAlign = 64;

	    // The two layouts differ only in the mangling component ("m:x" vs "m:e").
	    if (getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF())
	      resetDataLayout("e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32");
	    else
	      resetDataLayout("e-m:e-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32");
	  }
	};

	// x86-32 Windows Visual Studio target
	//
	// Long double is a 64-bit IEEE double here, and the Visual Studio macro set
	// plus _M_IX86 are defined on top of the generic Windows defines.
	class LLVM_LIBRARY_VISIBILITY MicrosoftX86_32TargetInfo
	    : public WindowsX86_32TargetInfo {
	public:
	  MicrosoftX86_32TargetInfo(const llvm::Triple &Triple,
	                            const TargetOptions &Opts)
	      : WindowsX86_32TargetInfo(Triple, Opts) {
	    LongDoubleAlign = 64;
	    LongDoubleWidth = 64;
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    // Generic Windows/x86-32 macros first, then the Visual Studio ones.
	    WindowsX86_32TargetInfo::getTargetDefines(Opts, Builder);
	    WindowsX86_32TargetInfo::getVisualStudioDefines(Opts, Builder);

	    // _M_IX86 reflects the processor type:
	    //   300=386, 400=486, 500=Pentium, 600=Blend (default)
	    // We lost the original triple, so we use the default.
	    Builder.defineMacro("_M_IX86", "600");
	  }
	};

	// x86-32 MinGW target
	//
	// Enables HasFloat128 and defines _X86_ in addition to the generic
	// Windows/x86-32 macros.
	class LLVM_LIBRARY_VISIBILITY MinGWX86_32TargetInfo
	: public WindowsX86_32TargetInfo {
	public:
	MinGWX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: WindowsX86_32TargetInfo(Triple, Opts) {
	HasFloat128 = true;
	}

	// Emits the base-class macros first, then _X86_.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	WindowsX86_32TargetInfo::getTargetDefines(Opts, Builder);
	Builder.defineMacro("_X86_");
	}
	};

	// x86-32 Cygwin target
	//
	// Uses an unsigned-short wchar_t, 64-bit alignment for double and long long,
	// and defines the Cygwin and unix macro families.
	class LLVM_LIBRARY_VISIBILITY CygwinX86_32TargetInfo : public X86_32TargetInfo {
	public:
	  CygwinX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86_32TargetInfo(Triple, Opts) {
	    WCharType = TargetInfo::UnsignedShort;
	    LongLongAlign = 64;
	    DoubleAlign = 64;
	    resetDataLayout("e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32");
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    // Start from the generic x86-32 macros.
	    X86_32TargetInfo::getTargetDefines(Opts, Builder);

	    // Cygwin identification macros.
	    Builder.defineMacro("_X86_");
	    Builder.defineMacro("__CYGWIN__");
	    Builder.defineMacro("__CYGWIN32__");
	    addCygMingDefines(Opts, Builder);
	    DefineStd(Builder, "unix", Opts);

	    // _GNU_SOURCE is only defined when compiling C++.
	    if (Opts.CPlusPlus)
	      Builder.defineMacro("_GNU_SOURCE");
	  }
	};

	// x86-32 target for Haiku.
	class LLVM_LIBRARY_VISIBILITY HaikuX86_32TargetInfo
	    : public HaikuTargetInfo<X86_32TargetInfo> {
	public:
	  HaikuX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : HaikuTargetInfo<X86_32TargetInfo>(Triple, Opts) {}

	  // Emits the inherited Haiku/x86-32 macros, then adds __INTEL__.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    using Parent = HaikuTargetInfo<X86_32TargetInfo>;
	    Parent::getTargetDefines(Opts, Builder);
	    Builder.defineMacro("__INTEL__");
	  }
	};

	// x86-32 target for the MCU (iamcu) environment.
	class LLVM_LIBRARY_VISIBILITY MCUX86_32TargetInfo : public X86_32TargetInfo {
	public:
	  MCUX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86_32TargetInfo(Triple, Opts) {
	    // long double is a 64-bit IEEE double; wint_t is unsigned int.
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	    LongDoubleWidth = 64;
	    WIntType = UnsignedInt;
	    resetDataLayout("e-m:e-p:32:32-i64:32-f64:32-f128:32-n8:16:32-a:0:32-S32");
	  }

	  // Only the C calling convention is accepted on MCU; anything else is
	  // reported as a warning.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    if (CC == CC_C)
	      return CCCR_OK;
	    return CCCR_Warning;
	  }

	  // Emits the generic x86-32 macros plus the two iamcu identification macros.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    X86_32TargetInfo::getTargetDefines(Opts, Builder);
	    for (const char *Name : {"__iamcu", "__iamcu__"})
	      Builder.defineMacro(Name);
	  }

	  bool allowsLargerPreferedTypeAlignment() const override { return false; }
	};

	// x86-32 target for RTEMS.
	class LLVM_LIBRARY_VISIBILITY RTEMSX86_32TargetInfo : public X86_32TargetInfo {
	public:
	  RTEMSX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86_32TargetInfo(Triple, Opts) {
	    // size_t, intptr_t and ptrdiff_t are based on `long` here.
	    PtrDiffType = SignedLong;
	    IntPtrType = SignedLong;
	    SizeType = UnsignedLong;
	  }

	  // Emits the generic x86-32 macros, then __INTEL__ and __rtems__.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    X86_32TargetInfo::getTargetDefines(Opts, Builder);
	    for (const char *Name : {"__INTEL__", "__rtems__"})
	      Builder.defineMacro(Name);
	  }
	};

	// x86-64 generic target
	//
	// Base description for x86-64 targets. The GNUX32 environment (32-bit
	// pointers and longs) is also handled by this class.
	class LLVM_LIBRARY_VISIBILITY X86_64TargetInfo : public X86TargetInfo {
	public:
	  X86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86TargetInfo(Triple, Opts) {
	    const bool IsX32 = getTriple().getEnvironment() == llvm::Triple::GNUX32;
	    const bool IsWinCOFF =
	        getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();

	    // long and pointers are 32 bits wide on x32 and 64 bits otherwise.
	    LongWidth = LongAlign = PointerWidth = PointerAlign = IsX32 ? 32 : 64;
	    LongDoubleWidth = 128;
	    LongDoubleAlign = 128;
	    LargeArrayMinWidth = 128;
	    LargeArrayAlign = 128;
	    SuitableAlign = 128;
	    SizeType = IsX32 ? UnsignedInt : UnsignedLong;
	    PtrDiffType = IsX32 ? SignedInt : SignedLong;
	    IntPtrType = IsX32 ? SignedInt : SignedLong;
	    // On x32 `long` is only 32 bits, so the 64-bit types use long long.
	    IntMaxType = IsX32 ? SignedLongLong : SignedLong;
	    Int64Type = IsX32 ? SignedLongLong : SignedLong;
	    RegParmMax = 6;

	    // Pointers are 32-bit in x32.
	    resetDataLayout(IsX32
	                        ? "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128"
	                        : IsWinCOFF ? "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
	                                    : "e-m:e-i64:64-f80:128-n8:16:32:64-S128");

	    // Use fpret only for long double.
	    RealTypeUsesObjCFPRet = (1 << TargetInfo::LongDouble);

	    // Use fp2ret for _Complex long double.
	    ComplexLongDoubleUsesFP2Ret = true;

	    // Make __builtin_ms_va_list available.
	    HasBuiltinMSVaList = true;

	    // x86-64 has atomics up to 16 bytes. Inline width starts at 64 and is
	    // raised by setMaxAtomicWidth() when the cx16 feature is present.
	    MaxAtomicPromoteWidth = 128;
	    MaxAtomicInlineWidth = 64;
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::X86_64ABIBuiltinVaList;
	  }

	  // Maps EH data register index 0 and 1 to themselves; any other index
	  // yields -1.
	  int getEHDataRegisterNumber(unsigned RegNo) const override {
	    if (RegNo == 0)
	      return 0;
	    if (RegNo == 1)
	      return 1;
	    return -1;
	  }

	  // Returns CCCR_OK for the conventions listed below and CCCR_Warning for
	  // every other convention.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    switch (CC) {
	    case CC_C:
	    case CC_Swift:
	    case CC_X86VectorCall:
	    case CC_IntelOclBicc:
	    case CC_Win64:
	    case CC_PreserveMost:
	    case CC_PreserveAll:
	    case CC_X86RegCall:
	    case CC_OpenCLKernel:
	      return CCCR_OK;
	    default:
	      return CCCR_Warning;
	    }
	  }

	  CallingConv getDefaultCallingConv(CallingConvMethodType MT) const override {
	    return CC_C;
	  }

	  // for x32 we need it here explicitly
	  bool hasInt128Type() const override { return true; }

	  unsigned getUnwindWordWidth() const override { return 64; }

	  unsigned getRegisterWidth() const override { return 64; }

	  // Accepts "rsp" and "rbp" as global register variables (reporting a size
	  // mismatch through HasSizeMismatch unless RegSize is 64) and defers every
	  // other name to X86TargetInfo.
	  bool validateGlobalRegisterVariable(StringRef RegName, unsigned RegSize,
	                                      bool &HasSizeMismatch) const override {
	    // rsp and rbp are the only 64-bit registers the x86 backend can currently
	    // handle.
	    if (RegName.equals("rsp") || RegName.equals("rbp")) {
	      // Check that the register size is 64-bit.
	      HasSizeMismatch = RegSize != 64;
	      return true;
	    }

	    // Check if the register is a 32-bit register the backend can handle.
	    return X86TargetInfo::validateGlobalRegisterVariable(RegName, RegSize,
	                                                         HasSizeMismatch);
	  }

	  // Raises the inline atomic width to 128 bits when cx16 is enabled.
	  void setMaxAtomicWidth() override {
	    if (hasFeature("cx16"))
	      MaxAtomicInlineWidth = 128;
	  }

	  ArrayRef<Builtin::Info> getTargetBuiltins() const override;
	};

	// x86-64 target shared by the Windows environments.
	class LLVM_LIBRARY_VISIBILITY WindowsX86_64TargetInfo
	    : public WindowsTargetInfo<X86_64TargetInfo> {
	public:
	  WindowsX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : WindowsTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    // `long` stays 32 bits wide, so every 64-bit integer typedef is based
	    // on long long.
	    LongAlign = 32;
	    LongWidth = 32;
	    LongLongAlign = 64;
	    DoubleAlign = 64;
	    SizeType = UnsignedLongLong;
	    IntMaxType = SignedLongLong;
	    Int64Type = SignedLongLong;
	    IntPtrType = SignedLongLong;
	    PtrDiffType = SignedLongLong;
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }

	  // stdcall/thiscall/fastcall yield CCCR_Ignore, the conventions listed in
	  // the second group yield CCCR_OK, and everything else CCCR_Warning.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    switch (CC) {
	    case CC_C:
	    case CC_Swift:
	    case CC_X86VectorCall:
	    case CC_X86RegCall:
	    case CC_X86_64SysV:
	    case CC_IntelOclBicc:
	    case CC_PreserveMost:
	    case CC_PreserveAll:
	    case CC_OpenCLKernel:
	      return CCCR_OK;
	    case CC_X86StdCall:
	    case CC_X86ThisCall:
	    case CC_X86FastCall:
	      return CCCR_Ignore;
	    default:
	      break;
	    }
	    return CCCR_Warning;
	  }
	};

	// x86-64 target for the Microsoft Visual Studio environment.
	class LLVM_LIBRARY_VISIBILITY MicrosoftX86_64TargetInfo
	    : public WindowsX86_64TargetInfo {
	public:
	  MicrosoftX86_64TargetInfo(const llvm::Triple &Triple,
	                            const TargetOptions &Opts)
	      : WindowsX86_64TargetInfo(Triple, Opts) {
	    // long double is represented as a 64-bit IEEE double on this target.
	    LongDoubleAlign = 64;
	    LongDoubleWidth = 64;
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	  }

	  // Emits the inherited Windows macros, the Visual Studio macros, and the
	  // _M_X64 / _M_AMD64 macros (both defined as 100).
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    using Parent = WindowsX86_64TargetInfo;
	    Parent::getTargetDefines(Opts, Builder);
	    Parent::getVisualStudioDefines(Opts, Builder);
	    for (const char *Name : {"_M_X64", "_M_AMD64"})
	      Builder.defineMacro(Name, "100");
	  }
	};

	// x86-64 target for MinGW toolchains.
	class LLVM_LIBRARY_VISIBILITY MinGWX86_64TargetInfo
	    : public WindowsX86_64TargetInfo {
	public:
	  MinGWX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : WindowsX86_64TargetInfo(Triple, Opts) {
	    // Mingw64 pads long double to 16 bytes (size and alignment) yet keeps
	    // the x87 extended-precision format.
	    LongDoubleFormat = &llvm::APFloat::x87DoubleExtended();
	    LongDoubleAlign = 128;
	    LongDoubleWidth = 128;
	    HasFloat128 = true;
	  }
	};

	// x86-64 target for Cygwin.
	class LLVM_LIBRARY_VISIBILITY CygwinX86_64TargetInfo : public X86_64TargetInfo {
	public:
	  CygwinX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86_64TargetInfo(Triple, Opts) {
	    // wchar_t maps to unsigned short; thread-local storage is disabled.
	    TLSSupported = false;
	    WCharType = UnsignedShort;
	  }

	  // Emits the generic x86-64 macros followed by the Cygwin-specific ones.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    X86_64TargetInfo::getTargetDefines(Opts, Builder);

	    for (const char *Name : {"__x86_64__", "__CYGWIN__", "__CYGWIN64__"})
	      Builder.defineMacro(Name);

	    addCygMingDefines(Opts, Builder);
	    DefineStd(Builder, "unix", Opts);

	    // _GNU_SOURCE is only predefined when compiling C++.
	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }
	  }
	};

	// x86-64 target for Darwin.
	class LLVM_LIBRARY_VISIBILITY DarwinX86_64TargetInfo
	    : public DarwinTargetInfo<X86_64TargetInfo> {
	public:
	  DarwinX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : DarwinTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    Int64Type = SignedLongLong;
	    // The 64-bit iOS simulator uses the builtin bool type for Objective-C.
	    if (Triple.isiOS()) {
	      UseSignedCharForObjCBool = false;
	    }
	    resetDataLayout("e-m:o-i64:64-f80:128-n8:16:32:64-S128");
	  }

	  // Runs the inherited feature handling and, if it succeeds, derives the
	  // maximum vector alignment from the enabled AVX level.
	  bool handleTargetFeatures(std::vector<std::string> &Features,
	                            DiagnosticsEngine &Diags) override {
	    using Parent = DarwinTargetInfo<X86_64TargetInfo>;
	    if (!Parent::handleTargetFeatures(Features, Diags))
	      return false;

	    // The feature set is known at this point: avx512f -> 512 bits,
	    // avx -> 256 bits, otherwise 128 bits.
	    if (hasFeature("avx512f"))
	      MaxVectorAlign = 512;
	    else if (hasFeature("avx"))
	      MaxVectorAlign = 256;
	    else
	      MaxVectorAlign = 128;
	    return true;
	  }
	};

	// x86-64 target for OpenBSD.
	class LLVM_LIBRARY_VISIBILITY OpenBSDX86_64TargetInfo
	    : public OpenBSDTargetInfo<X86_64TargetInfo> {
	public:
	  OpenBSDX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OpenBSDTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    // int64_t and intmax_t are based on long long instead of long.
	    Int64Type = SignedLongLong;
	    IntMaxType = SignedLongLong;
	  }
	};

	// x86-32 target for Android (built on the Linux target).
	class LLVM_LIBRARY_VISIBILITY AndroidX86_32TargetInfo
	    : public LinuxTargetInfo<X86_32TargetInfo> {
	public:
	  AndroidX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : LinuxTargetInfo<X86_32TargetInfo>(Triple, Opts) {
	    // long double is a 64-bit IEEE double; suitable alignment is 32 bits.
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	    LongDoubleWidth = 64;
	    SuitableAlign = 32;
	  }
	};

	// x86-64 target for Android (built on the Linux target).
	class LLVM_LIBRARY_VISIBILITY AndroidX86_64TargetInfo
	    : public LinuxTargetInfo<X86_64TargetInfo> {
	public:
	  AndroidX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : LinuxTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    // long double uses the IEEE quad format on this target.
	    LongDoubleFormat = &llvm::APFloat::IEEEquad();
	  }

	  // long double is mangled the way float128 is.
	  bool useFloat128ManglingForLongDouble() const override {
	    return true;
	  }
	};
	} // namespace targets
	} // namespace clang
	#endif // LLVM_CLANG_LIB_BASIC_TARGETS_X86_H
	Index: head/contrib/llvm/tools/clang
	===================================================================
	--- head/contrib/llvm/tools/clang (revision 328816)
	+++ head/contrib/llvm/tools/clang (revision 328817)

	Property changes on: head/contrib/llvm/tools/clang
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/clang/dist-release_60:r328750-328794
	Index: head/contrib/llvm/tools/lld/ELF/Arch/X86.cpp
	===================================================================
	--- head/contrib/llvm/tools/lld/ELF/Arch/X86.cpp (revision 328816)
	+++ head/contrib/llvm/tools/lld/ELF/Arch/X86.cpp (revision 328817)
	@@ -1,405 +1,543 @@
	//===- X86.cpp ------------------------------------------------------------===//
	//
	// The LLVM Linker
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "InputFiles.h"
	#include "Symbols.h"
	#include "SyntheticSections.h"
	#include "Target.h"
	#include "lld/Common/ErrorHandler.h"
	#include "llvm/Support/Endian.h"

	using namespace llvm;
	using namespace llvm::support::endian;
	using namespace llvm::ELF;
	using namespace lld;
	using namespace lld::elf;

	namespace {
	-class X86 final : public TargetInfo {
	+class X86 : public TargetInfo {
	// i386 ELF target description. This revision drops `final` from the class:
	// RetpolinePic and RetpolineNoPic, defined later in this file, derive from
	// X86 and override writeGotPlt, writePltHeader and writePlt.
	public:
	X86();
	// Relocation classification and addend extraction.
	RelExpr getRelExpr(RelType Type, const Symbol &S,
	const uint8_t *Loc) const override;
	int64_t getImplicitAddend(const uint8_t *Buf, RelType Type) const override;
	// .got.plt and PLT emission.
	void writeGotPltHeader(uint8_t *Buf) const override;
	RelType getDynRel(RelType Type) const override;
	void writeGotPlt(uint8_t *Buf, const Symbol &S) const override;
	void writeIgotPlt(uint8_t *Buf, const Symbol &S) const override;
	void writePltHeader(uint8_t *Buf) const override;
	void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr,
	int32_t Index, unsigned RelOff) const override;
	// Applies a computed relocation value at Loc.
	void relocateOne(uint8_t *Loc, RelType Type, uint64_t Val) const override;

	// TLS relaxation hooks: each rewrites the instruction bytes around Loc.
	RelExpr adjustRelaxExpr(RelType Type, const uint8_t *Data,
	RelExpr Expr) const override;
	void relaxTlsGdToIe(uint8_t *Loc, RelType Type, uint64_t Val) const override;
	void relaxTlsGdToLe(uint8_t *Loc, RelType Type, uint64_t Val) const override;
	void relaxTlsIeToLe(uint8_t *Loc, RelType Type, uint64_t Val) const override;
	void relaxTlsLdToLe(uint8_t *Loc, RelType Type, uint64_t Val) const override;
	};
	} // namespace

	// Fills in the i386-specific relocation types and table geometry.
	X86::X86() {
	  // Dynamic relocation types.
	  CopyRel = R_386_COPY;
	  GotRel = R_386_GLOB_DAT;
	  PltRel = R_386_JUMP_SLOT;
	  RelativeRel = R_386_RELATIVE;
	  IRelativeRel = R_386_IRELATIVE;

	  // TLS relocation types.
	  TlsGotRel = R_386_TLS_TPOFF;
	  TlsModuleIndexRel = R_386_TLS_DTPMOD32;
	  TlsOffsetRel = R_386_TLS_DTPOFF32;
	  TlsGdRelaxSkip = 2;

	  // GOT / PLT layout: 4-byte GOT slots, 16-byte PLT header and entries.
	  GotBaseSymOff = -1;
	  GotEntrySize = 4;
	  GotPltEntrySize = 4;
	  PltHeaderSize = 16;
	  PltEntrySize = 16;

	  // Four INT3 (0xcc) bytes.
	  TrapInstr = 0xcccccccc;
	}

	// Returns false only when the ModRM byte has mod == 00 and r/m == 101 (the
	// reg field in bits 5-3 is ignored); returns true for every other value.
	static bool hasBaseReg(uint8_t ModRM) {
	  const uint8_t ModAndRM = ModRM & 0xc7; // keep bits 7-6 (mod) and 2-0 (r/m)
	  return ModAndRM != 0x05;
	}

	// Maps an i386 relocation type to the linker's relocation expression kind.
	// Unknown types map to R_INVALID.
	RelExpr X86::getRelExpr(RelType Type, const Symbol &S,
	                        const uint8_t *Loc) const {
	  switch (Type) {
	  case R_386_NONE:
	    return R_NONE;

	  // Absolute values.
	  case R_386_8:
	  case R_386_16:
	  case R_386_32:
	  case R_386_TLS_LDO_32:
	    return R_ABS;

	  // PC-relative values.
	  case R_386_PC8:
	  case R_386_PC16:
	  case R_386_PC32:
	    return R_PC;
	  case R_386_PLT32:
	    return R_PLT_PC;
	  case R_386_GOTPC:
	    return R_GOTONLY_PC_FROM_END;

	  // GOT-based values.
	  case R_386_GOTOFF:
	    return R_GOTREL_FROM_END;
	  case R_386_GOT32:
	  case R_386_GOT32X: {
	    // The meaning of these two relocations depends on the instruction they
	    // patch. That is unfortunate, because a linker normally does not care
	    // whether a section holds valid machine code, but the ABI documents it
	    // this way, so it is implemented as specified.
	    //
	    // i386 has no PC-relative data addressing. To reach a GOT slot, either
	    // the GOT address is fixed at link time (non-PIC), or the compiler
	    // emits code that materializes the GOT address in a register at run
	    // time (PIC; the register is usually %ebx). The choice is made at
	    // compile time, yielding two operand forms for symbol foo:
	    //
	    //   foo@GOT        absolute form, unusable in PIC output. It resolves
	    //                  to the absolute address of foo's GOT slot: G + A.
	    //   foo@GOT(%reg)  register-relative form. It resolves to the offset of
	    //                  foo's slot from the GOT: G + A - GOT.
	    //
	    // Compilers emit the same relocation for both forms, so the only way
	    // to tell them apart is to inspect the instruction. Loc[0] is assumed
	    // to be the first byte of a displacement or immediate field of a valid
	    // instruction, which puts the ModRM byte at Loc[-1]; that byte tells
	    // whether the operand is register-relative or absolute.
	    const bool RegisterRelative = hasBaseReg(Loc[-1]);
	    if (RegisterRelative)
	      return R_GOT_FROM_END;
	    return R_GOT;
	  }

	  // TLS models.
	  case R_386_TLS_GD:
	    return R_TLSGD;
	  case R_386_TLS_LDM:
	    return R_TLSLD;
	  case R_386_TLS_IE:
	    return R_GOT;
	  case R_386_TLS_GOTIE:
	    return R_GOT_FROM_END;
	  case R_386_TLS_LE:
	    return R_TLS;
	  case R_386_TLS_LE_32:
	    return R_NEG_TLS;

	  default:
	    return R_INVALID;
	  }
	}

	// Replaces the two generic GD relaxation expressions with their i386
	// variants; every other expression is returned unchanged.
	RelExpr X86::adjustRelaxExpr(RelType Type, const uint8_t *Data,
	                             RelExpr Expr) const {
	  if (Expr == R_RELAX_TLS_GD_TO_IE)
	    return R_RELAX_TLS_GD_TO_IE_END;
	  if (Expr == R_RELAX_TLS_GD_TO_LE)
	    return R_RELAX_TLS_GD_TO_LE_NEG;
	  return Expr;
	}

	// Stores the address of the dynamic section in the first .got.plt word.
	void X86::writeGotPltHeader(uint8_t *Buf) const {
	  const auto DynamicVA = InX::Dynamic->getVA();
	  write32le(Buf, DynamicVA);
	}

	// Initializes a .got.plt slot so that it points back into the symbol's own
	// PLT entry, 6 bytes in, which skips that entry's first instruction.
	void X86::writeGotPlt(uint8_t *Buf, const Symbol &S) const {
	  const auto LazyTarget = S.getPltVA() + 6;
	  write32le(Buf, LazyTarget);
	}

	// On x86 the slot simply holds the address of the ifunc resolver function.
	void X86::writeIgotPlt(uint8_t *Buf, const Symbol &S) const {
	  const auto ResolverVA = S.getVA();
	  write32le(Buf, ResolverVA);
	}

	// Translates the two local-exec TLS relocation types into their TPOFF
	// counterparts; every other type is returned as is.
	RelType X86::getDynRel(RelType Type) const {
	  switch (Type) {
	  case R_386_TLS_LE:
	    return R_386_TLS_TPOFF;
	  case R_386_TLS_LE_32:
	    return R_386_TLS_TPOFF32;
	  default:
	    return Type;
	  }
	}

	// Writes the 16-byte PLT header: push the word at GOTPLT+4, jump through the
	// word at GOTPLT+8, then four bytes of nop padding. Two encodings exist,
	// chosen by Config->Pic.
	void X86::writePltHeader(uint8_t *Buf) const {
	if (Config->Pic) {
	// PIC form: both memory operands are displacements off %ebx.
	const uint8_t V[] = {
	0xff, 0xb3, 0x04, 0x00, 0x00, 0x00, // pushl GOTPLT+4(%ebx)
	0xff, 0xa3, 0x08, 0x00, 0x00, 0x00, // jmp *GOTPLT+8(%ebx)
	0x90, 0x90, 0x90, 0x90 // nop
	};
	memcpy(Buf, V, sizeof(V));

	// The displacements are taken relative to the end of .got (VA + size),
	// which is the value this code expects %ebx to hold.
	uint32_t Ebx = InX::Got->getVA() + InX::Got->getSize();
	uint32_t GotPlt = InX::GotPlt->getVA() - Ebx;
	// Patch the 32-bit displacement of each instruction (after its 2-byte
	// opcode/ModRM prefix).
	write32le(Buf + 2, GotPlt + 4);
	write32le(Buf + 8, GotPlt + 8);
	return;
	}

	// Non-PIC form: both memory operands are absolute addresses.
	const uint8_t PltData[] = {
	0xff, 0x35, 0, 0, 0, 0, // pushl (GOTPLT+4)
	0xff, 0x25, 0, 0, 0, 0, // jmp *(GOTPLT+8)
	0x90, 0x90, 0x90, 0x90, // nop
	};
	memcpy(Buf, PltData, sizeof(PltData));
	uint32_t GotPlt = InX::GotPlt->getVA();
	write32le(Buf + 2, GotPlt + 4);
	write32le(Buf + 8, GotPlt + 8);
	}

	// Writes one 16-byte PLT entry: an indirect jump through the symbol's
	// .got.plt slot, a push of RelOff, and a jump back to the start of the PLT.
	void X86::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr,
	uint64_t PltEntryAddr, int32_t Index,
	unsigned RelOff) const {
	// Template; the ModRM byte of the first jmp and all operands are patched
	// below.
	const uint8_t Inst[] = {
	0xff, 0x00, 0, 0, 0, 0, // jmp foo_in_GOT or jmp foo@GOT(%ebx)
	0x68, 0, 0, 0, 0, // pushl $reloc_offset
	0xe9, 0, 0, 0, 0, // jmp .PLT0@PC
	};
	memcpy(Buf, Inst, sizeof(Inst));

	if (Config->Pic) {
	// jmp *foo@GOT(%ebx)
	// The displacement is relative to the end of .got (VA + size).
	uint32_t Ebx = InX::Got->getVA() + InX::Got->getSize();
	Buf[1] = 0xa3;
	write32le(Buf + 2, GotPltEntryAddr - Ebx);
	} else {
	// jmp *foo_in_GOT
	Buf[1] = 0x25;
	write32le(Buf + 2, GotPltEntryAddr);
	}

	// Immediate operand of the pushl.
	write32le(Buf + 7, RelOff);
	// rel32 of the final jmp, measured from the end of this 16-byte entry back
	// to the first byte of the PLT (header plus Index preceding entries).
	write32le(Buf + 12, -Index * PltEntrySize - PltHeaderSize - 16);
	}

	// Reads the addend stored in the section bytes at Buf for the given
	// relocation type and sign-extends it to 64 bits. Types that are not listed
	// yield an addend of 0.
	int64_t X86::getImplicitAddend(const uint8_t *Buf, RelType Type) const {
	  switch (Type) {
	  case R_386_8:
	  case R_386_PC8: {
	    const auto Raw = *Buf;
	    return SignExtend64<8>(Raw);
	  }
	  case R_386_16:
	  case R_386_PC16: {
	    const auto Raw = read16le(Buf);
	    return SignExtend64<16>(Raw);
	  }
	  case R_386_32:
	  case R_386_PC32:
	  case R_386_PLT32:
	  case R_386_GOT32:
	  case R_386_GOT32X:
	  case R_386_GOTOFF:
	  case R_386_GOTPC:
	  case R_386_TLS_LDO_32:
	  case R_386_TLS_LE: {
	    const auto Raw = read32le(Buf);
	    return SignExtend64<32>(Raw);
	  }
	  default:
	    break;
	  }
	  return 0;
	}

	// Range-checks Val for the given relocation type and stores it at Loc in
	// little-endian order. Unknown types are reported through error().
	void X86::relocateOne(uint8_t *Loc, RelType Type, uint64_t Val) const {
	  switch (Type) {
	  // The 8- and 16-bit relocations (R_386_{PC,}{8,16}) are outside the i386
	  // psABI, but 16-bit programs such as boot loaders rely on them, so they
	  // are supported.
	  case R_386_8:
	    checkUInt<8>(Loc, Val, Type);
	    Loc[0] = Val;
	    return;
	  case R_386_PC8:
	    checkInt<8>(Loc, Val, Type);
	    Loc[0] = Val;
	    return;
	  case R_386_16:
	    checkUInt<16>(Loc, Val, Type);
	    write16le(Loc, Val);
	    return;
	  case R_386_PC16:
	    // R_386_PC16 normally appears in 16-bit code, where the PC is as wide
	    // as the addend, so with wrap-around any 16-bit address can reach any
	    // other. The only meaningful restriction would be that the destination
	    // fits in 16 bits, but that cannot be verified here: Val is the final
	    // value with the current location already subtracted. Checking that
	    // Val fits in 17 bits misses some bad cases but should never reject a
	    // valid one.
	    checkInt<17>(Loc, Val, Type);
	    write16le(Loc, Val);
	    return;

	  // All remaining supported types store a 32-bit value.
	  case R_386_32:
	  case R_386_PC32:
	  case R_386_PLT32:
	  case R_386_GLOB_DAT:
	  case R_386_RELATIVE:
	  case R_386_GOT32:
	  case R_386_GOT32X:
	  case R_386_GOTOFF:
	  case R_386_GOTPC:
	  case R_386_TLS_DTPMOD32:
	  case R_386_TLS_DTPOFF32:
	  case R_386_TLS_GD:
	  case R_386_TLS_GOTIE:
	  case R_386_TLS_IE:
	  case R_386_TLS_LDM:
	  case R_386_TLS_LDO_32:
	  case R_386_TLS_LE:
	  case R_386_TLS_LE_32:
	  case R_386_TLS_TPOFF:
	  case R_386_TLS_TPOFF32:
	    checkInt<32>(Loc, Val, Type);
	    write32le(Loc, Val);
	    return;

	  default:
	    error(getErrorLocation(Loc) + "unrecognized reloc " + Twine(Type));
	    return;
	  }
	}

	// Relaxes a General Dynamic TLS access to Local Exec by overwriting the
	// 12 bytes starting at Loc - 3 and then patching in Val.
	void X86::relaxTlsGdToLe(uint8_t *Loc, RelType Type, uint64_t Val) const {
	// Convert
	// leal x@tlsgd(, %ebx, 1),
	// call __tls_get_addr@plt
	// to
	// movl %gs:0,%eax
	// subl $x@ntpoff,%eax
	const uint8_t Inst[] = {
	0x65, 0xa1, 0x00, 0x00, 0x00, 0x00, // movl %gs:0, %eax
	0x81, 0xe8, 0, 0, 0, 0, // subl $Val, %eax
	};
	memcpy(Loc - 3, Inst, sizeof(Inst));
	// The subl immediate sits at byte 8 of Inst, i.e. at Loc - 3 + 8 = Loc + 5.
	write32le(Loc + 5, Val);
	}

	// Relaxes a General Dynamic TLS access to Initial Exec by overwriting the
	// 12 bytes starting at Loc - 3 and then patching in Val.
	void X86::relaxTlsGdToIe(uint8_t *Loc, RelType Type, uint64_t Val) const {
	// Convert
	// leal x@tlsgd(, %ebx, 1),
	// call __tls_get_addr@plt
	// to
	// movl %gs:0, %eax
	// addl x@gotntpoff(%ebx), %eax
	const uint8_t Inst[] = {
	0x65, 0xa1, 0x00, 0x00, 0x00, 0x00, // movl %gs:0, %eax
	0x03, 0x83, 0, 0, 0, 0, // addl Val(%ebx), %eax
	};
	memcpy(Loc - 3, Inst, sizeof(Inst));
	// The addl displacement sits at byte 8 of Inst, i.e. at Loc - 3 + 8 = Loc + 5.
	write32le(Loc + 5, Val);
	}

	// In some conditions, relocations can be optimized to avoid using GOT.
	// This function does that for Initial Exec to Local Exec case.
	//
	// Loc points at the 32-bit operand being relocated; the opcode and ModRM
	// bytes in front of it (Loc[-2], Loc[-1]) are rewritten in place and Val is
	// then stored at Loc.
	void X86::relaxTlsIeToLe(uint8_t *Loc, RelType Type, uint64_t Val) const {
	  // Ulrich's document section 6.2 says that @gotntpoff can
	  // be used with MOVL or ADDL instructions.
	  // @indntpoff is similar to @gotntpoff, but for use in
	  // position dependent code.

	  // Bits 5-3 of the byte preceding the operand, read as the ModRM reg
	  // field, i.e. the destination register of the original instruction.
	  uint8_t Reg = (Loc[-1] >> 3) & 7;

	  if (Type == R_386_TLS_IE) {
	    if (Loc[-1] == 0xa1) {
	      // "movl foo@indntpoff,%eax" -> "movl $foo,%eax"
	      // This case is different from the generic case below because
	      // this is a 5 byte instruction while below is 6 bytes.
	      Loc[-1] = 0xb8;
	    } else if (Loc[-2] == 0x8b) {
	      // "movl foo@indntpoff,%reg" -> "movl $foo,%reg"
	      Loc[-2] = 0xc7;
	      Loc[-1] = 0xc0 | Reg;
	    } else {
	      // "addl foo@indntpoff,%reg" -> "addl $foo,%reg"
	      Loc[-2] = 0x81;
	      Loc[-1] = 0xc0 | Reg;
	    }
	  } else {
	    assert(Type == R_386_TLS_GOTIE);
	    if (Loc[-2] == 0x8b) {
	      // "movl foo@gotntpoff(%base),%reg" -> "movl $foo,%reg"
	      Loc[-2] = 0xc7;
	      Loc[-1] = 0xc0 | Reg;
	    } else {
	      // "addl foo@gotntpoff(%base),%reg" -> "leal foo(%reg),%reg"
	      Loc[-2] = 0x8d;
	      Loc[-1] = 0x80 | (Reg << 3) | Reg;
	    }
	  }
	  write32le(Loc, Val);
	}

	// Relaxes a Local Dynamic TLS access to Local Exec. For R_386_TLS_LDO_32 the
	// value is simply stored at Loc; for any other type the 11 bytes starting at
	// Loc - 2 are overwritten with a fixed instruction sequence.
	void X86::relaxTlsLdToLe(uint8_t *Loc, RelType Type, uint64_t Val) const {
	if (Type == R_386_TLS_LDO_32) {
	write32le(Loc, Val);
	return;
	}

	// Convert
	// leal foo(%reg),%eax
	// call ___tls_get_addr
	// to
	// movl %gs:0,%eax
	// nop
	// leal 0(%esi,1),%esi
	const uint8_t Inst[] = {
	0x65, 0xa1, 0x00, 0x00, 0x00, 0x00, // movl %gs:0,%eax
	0x90, // nop
	0x8d, 0x74, 0x26, 0x00, // leal 0(%esi,1),%esi
	};
	// Val is not used on this path: the replacement sequence has no operand
	// to patch.
	memcpy(Loc - 2, Inst, sizeof(Inst));
	}

	+namespace {
	+class RetpolinePic : public X86 { // returned by getX86TargetInfo() when Config->ZRetpolineplt and Config->Pic are both set
	+public:
	+ RetpolinePic();
	+ void writeGotPlt(uint8_t *Buf, const Symbol &S) const override;
	+ void writePltHeader(uint8_t *Buf) const override;
	+ void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr,
	+ int32_t Index, unsigned RelOff) const override;
	+};
	+
	+class RetpolineNoPic : public X86 { // returned by getX86TargetInfo() when Config->ZRetpolineplt is set and Config->Pic is not
	+public:
	+ RetpolineNoPic();
	+ void writeGotPlt(uint8_t *Buf, const Symbol &S) const override;
	+ void writePltHeader(uint8_t *Buf) const override;
	+ void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr,
	+ int32_t Index, unsigned RelOff) const override;
	+};
	+} // namespace
	+
	+RetpolinePic::RetpolinePic() { // overrides the 16/16 sizes set by X86::X86()
	+ PltHeaderSize = 48; // header template in writePltHeader is 0x2f bytes
	+ PltEntrySize = 32; // entry template in writePlt is 27 bytes
	+}
	+
	+void RetpolinePic::writeGotPlt(uint8_t *Buf, const Symbol &S) const {
	+ write32le(Buf, S.getPltVA() + 17); // offset 17 (0x11) is the `pushl $reloc_offset` in the entry written by writePlt
	+}
	+
	+void RetpolinePic::writePltHeader(uint8_t *Buf) const { // PIC retpoline PLT header; offsets in the comments below are hexadecimal
	+ const uint8_t Insn[] = {
	+ 0xff, 0xb3, 0, 0, 0, 0, // 0: pushl GOTPLT+4(%ebx)
	+ 0x50, // 6: pushl %eax
	+ 0x8b, 0x83, 0, 0, 0, 0, // 7: mov GOTPLT+8(%ebx), %eax
	+ 0xe8, 0x0e, 0x00, 0x00, 0x00, // d: call next
	+ 0xf3, 0x90, // 12: loop: pause
	+ 0x0f, 0xae, 0xe8, // 14: lfence
	+ 0xeb, 0xf9, // 17: jmp loop
	+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19: int3; .align 16
	+ 0x89, 0x0c, 0x24, // 20: next: mov %ecx, (%esp)
	+ 0x8b, 0x4c, 0x24, 0x04, // 23: mov 0x4(%esp), %ecx
	+ 0x89, 0x44, 0x24, 0x04, // 27: mov %eax ,0x4(%esp)
	+ 0x89, 0xc8, // 2b: mov %ecx, %eax
	+ 0x59, // 2d: pop %ecx
	+ 0xc3, // 2e: ret
	+ };
	+ memcpy(Buf, Insn, sizeof(Insn));
	+
	+ uint32_t Ebx = InX::Got->getVA() + InX::Got->getSize(); // end of .got, the base the %ebx-relative displacements are computed from
	+ uint32_t GotPlt = InX::GotPlt->getVA() - Ebx;
	+ write32le(Buf + 2, GotPlt + 4); // displacement of the pushl at offset 0
	+ write32le(Buf + 9, GotPlt + 8); // displacement of the mov at offset 7
	+}
	+
	+void RetpolinePic::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr,
	+ uint64_t PltEntryAddr, int32_t Index,
	+ unsigned RelOff) const {
	+ const uint8_t Insn[] = {
	+ 0x50, // 0: pushl %eax
	+ 0x8b, 0x83, 0, 0, 0, 0, // 1: mov foo@GOT(%ebx), %eax
	+ 0xe8, 0, 0, 0, 0, // 7: call plt+0x20
	+ 0xe9, 0, 0, 0, 0, // c: jmp plt+0x12
	+ 0x68, 0, 0, 0, 0, // 11: pushl $reloc_offset
	+ 0xe9, 0, 0, 0, 0, // 16: jmp plt+0
	+ };
	+ memcpy(Buf, Insn, sizeof(Insn));
	+
	+ uint32_t Ebx = InX::Got->getVA() + InX::Got->getSize(); // end of .got, the base the %ebx-relative displacement is computed from
	+ write32le(Buf + 3, GotPltEntryAddr - Ebx); // displacement of the mov
	+ write32le(Buf + 8, -Index * PltEntrySize - PltHeaderSize - 12 + 32); // call rel32: from entry+12 to plt+0x20
	+ write32le(Buf + 13, -Index * PltEntrySize - PltHeaderSize - 17 + 18); // jmp rel32: from entry+17 to plt+0x12
	+ write32le(Buf + 18, RelOff); // immediate of the pushl
	+ write32le(Buf + 23, -Index * PltEntrySize - PltHeaderSize - 27); // jmp rel32: from entry+27 to plt+0
	+}
	+
	+RetpolineNoPic::RetpolineNoPic() { // overrides the 16/16 sizes set by X86::X86()
	+ PltHeaderSize = 48; // header template in writePltHeader is 0x2f bytes
	+ PltEntrySize = 32; // entry template in writePlt is 26 bytes
	+}
	+
	+void RetpolineNoPic::writeGotPlt(uint8_t *Buf, const Symbol &S) const {
	+ write32le(Buf, S.getPltVA() + 16); // offset 16 (0x10) is the `pushl $reloc_offset` in the entry written by writePlt
	+}
	+
	+void RetpolineNoPic::writePltHeader(uint8_t *Buf) const { // non-PIC retpoline PLT header; offsets in the comments below are hexadecimal
	+ const uint8_t PltData[] = {
	+ 0xff, 0x35, 0, 0, 0, 0, // 0: pushl GOTPLT+4
	+ 0x50, // 6: pushl %eax
	+ 0xa1, 0, 0, 0, 0, // 7: mov GOTPLT+8, %eax
	+ 0xe8, 0x0f, 0x00, 0x00, 0x00, // c: call next
	+ 0xf3, 0x90, // 11: loop: pause
	+ 0x0f, 0xae, 0xe8, // 13: lfence
	+ 0xeb, 0xf9, // 16: jmp loop
	+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 18: int3
	+ 0xcc, 0xcc, 0xcc, // 1d: int3; .align 16
	+ 0x89, 0x0c, 0x24, // 20: next: mov %ecx, (%esp)
	+ 0x8b, 0x4c, 0x24, 0x04, // 23: mov 0x4(%esp), %ecx
	+ 0x89, 0x44, 0x24, 0x04, // 27: mov %eax ,0x4(%esp)
	+ 0x89, 0xc8, // 2b: mov %ecx, %eax
	+ 0x59, // 2d: pop %ecx
	+ 0xc3, // 2e: ret
	+ };
	+ memcpy(Buf, PltData, sizeof(PltData));
	+
	+ uint32_t GotPlt = InX::GotPlt->getVA();
	+ write32le(Buf + 2, GotPlt + 4); // absolute address operand of the pushl at offset 0
	+ write32le(Buf + 8, GotPlt + 8); // absolute address operand of the mov at offset 7
	+}
	+
	+void RetpolineNoPic::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr,
	+ uint64_t PltEntryAddr, int32_t Index,
	+ unsigned RelOff) const {
	+ const uint8_t Insn[] = {
	+ 0x50, // 0: pushl %eax
	+ 0xa1, 0, 0, 0, 0, // 1: mov foo_in_GOT, %eax
	+ 0xe8, 0, 0, 0, 0, // 6: call plt+0x20
	+ 0xe9, 0, 0, 0, 0, // b: jmp plt+0x11
	+ 0x68, 0, 0, 0, 0, // 10: pushl $reloc_offset
	+ 0xe9, 0, 0, 0, 0, // 15: jmp plt+0
	+ };
	+ memcpy(Buf, Insn, sizeof(Insn));
	+
	+ write32le(Buf + 2, GotPltEntryAddr); // absolute address operand of the mov
	+ write32le(Buf + 7, -Index * PltEntrySize - PltHeaderSize - 11 + 32); // call rel32: from entry+11 to plt+0x20
	+ write32le(Buf + 12, -Index * PltEntrySize - PltHeaderSize - 16 + 17); // jmp rel32: from entry+16 to plt+0x11
	+ write32le(Buf + 17, RelOff); // immediate of the pushl
	+ write32le(Buf + 22, -Index * PltEntrySize - PltHeaderSize - 26); // jmp rel32: from entry+26 to plt+0
	+}
	+
	// Returns the function-local static target object for i386, chosen from the
	// Config->ZRetpolineplt and Config->Pic settings.
	TargetInfo *elf::getX86TargetInfo() {
	- static X86 Target;
	- return &Target;
	+ if (Config->ZRetpolineplt) { // retpoline PLT requested: pick the PIC or non-PIC variant
	+ if (Config->Pic) {
	+ static RetpolinePic T;
	+ return &T;
	+ }
	+ static RetpolineNoPic T;
	+ return &T;
	+ }
	+
	+ static X86 T; // default: the plain X86 target
	+ return &T;
	}
	Index: head/contrib/llvm/tools/lld/ELF/Arch/X86_64.cpp
	===================================================================
	--- head/contrib/llvm/tools/lld/ELF/Arch/X86_64.cpp (revision 328816)
	+++ head/contrib/llvm/tools/lld/ELF/Arch/X86_64.cpp (revision 328817)
	@@ -1,471 +1,584 @@
	//===- X86_64.cpp ---------------------------------------------------------===//
	//
	// The LLVM Linker
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "InputFiles.h"
	#include "Symbols.h"
	#include "SyntheticSections.h"
	#include "Target.h"
	#include "lld/Common/ErrorHandler.h"
	#include "llvm/Object/ELF.h"
	#include "llvm/Support/Endian.h"

	using namespace llvm;
	using namespace llvm::object;
	using namespace llvm::support::endian;
	using namespace llvm::ELF;
	using namespace lld;
	using namespace lld::elf;

	namespace {
	-template <class ELFT> class X86_64 final : public TargetInfo {
	+template <class ELFT> class X86_64 : public TargetInfo {
	// x86-64 ELF target description, parameterized on the ELF flavour. This
	// revision drops `final` from the class.
	public:
	X86_64();
	// Relocation classification.
	RelExpr getRelExpr(RelType Type, const Symbol &S,
	const uint8_t *Loc) const override;
	bool isPicRel(RelType Type) const override;
	// .got.plt and PLT emission.
	void writeGotPltHeader(uint8_t *Buf) const override;
	void writeGotPlt(uint8_t *Buf, const Symbol &S) const override;
	void writePltHeader(uint8_t *Buf) const override;
	void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr,
	int32_t Index, unsigned RelOff) const override;
	// Applies a computed relocation value at Loc.
	void relocateOne(uint8_t *Loc, RelType Type, uint64_t Val) const override;

	// GOT and TLS relaxation hooks: each rewrites the instruction bytes around
	// Loc.
	RelExpr adjustRelaxExpr(RelType Type, const uint8_t *Data,
	RelExpr Expr) const override;
	void relaxGot(uint8_t *Loc, uint64_t Val) const override;
	void relaxTlsGdToIe(uint8_t *Loc, RelType Type, uint64_t Val) const override;
	void relaxTlsGdToLe(uint8_t *Loc, RelType Type, uint64_t Val) const override;
	void relaxTlsIeToLe(uint8_t *Loc, RelType Type, uint64_t Val) const override;
	void relaxTlsLdToLe(uint8_t *Loc, RelType Type, uint64_t Val) const override;

	private:
	void relaxGotNoPic(uint8_t *Loc, uint64_t Val, uint8_t Op,
	uint8_t ModRm) const;
	};
	} // namespace

	// Fills in the x86-64-specific relocation types and table geometry.
	template <class ELFT> X86_64<ELFT>::X86_64() {
	  // Dynamic relocation types.
	  CopyRel = R_X86_64_COPY;
	  GotRel = R_X86_64_GLOB_DAT;
	  PltRel = R_X86_64_JUMP_SLOT;
	  IRelativeRel = R_X86_64_IRELATIVE;
	  RelativeRel = R_X86_64_RELATIVE;

	  // TLS relocation types.
	  TlsGotRel = R_X86_64_TPOFF64;
	  TlsModuleIndexRel = R_X86_64_DTPMOD64;
	  TlsOffsetRel = R_X86_64_DTPOFF64;
	  TlsGdRelaxSkip = 2;

	  // GOT / PLT layout: 8-byte GOT slots, 16-byte PLT header and entries.
	  GotBaseSymOff = -1;
	  GotEntrySize = 8;
	  GotPltEntrySize = 8;
	  PltHeaderSize = 16;
	  PltEntrySize = 16;

	  // Four INT3 (0xcc) bytes.
	  TrapInstr = 0xcccccccc;

	  // The default image base is aligned to the large page size (a.k.a.
	  // superpage or huge page). FreeBSD automatically promotes large,
	  // superpage-aligned allocations.
	  DefaultImageBase = 0x200000;
	}

	// Maps an x86-64 relocation type to the linker's relocation expression kind.
	// Unknown types map to R_INVALID.
	template <class ELFT>
	RelExpr X86_64<ELFT>::getRelExpr(RelType Type, const Symbol &S,
	                                 const uint8_t *Loc) const {
	  switch (Type) {
	  case R_X86_64_NONE:
	    return R_NONE;

	  // Absolute values.
	  case R_X86_64_8:
	  case R_X86_64_16:
	  case R_X86_64_32:
	  case R_X86_64_32S:
	  case R_X86_64_64:
	  case R_X86_64_DTPOFF32:
	  case R_X86_64_DTPOFF64:
	    return R_ABS;

	  // Symbol size.
	  case R_X86_64_SIZE32:
	  case R_X86_64_SIZE64:
	    return R_SIZE;

	  // PC-relative values.
	  case R_X86_64_PC32:
	  case R_X86_64_PC64:
	    return R_PC;
	  case R_X86_64_PLT32:
	    return R_PLT_PC;

	  // GOT-based values.
	  case R_X86_64_GOT32:
	  case R_X86_64_GOT64:
	    return R_GOT_FROM_END;
	  case R_X86_64_GOTPCREL:
	  case R_X86_64_GOTPCRELX:
	  case R_X86_64_REX_GOTPCRELX:
	  case R_X86_64_GOTTPOFF:
	    return R_GOT_PC;

	  // TLS models.
	  case R_X86_64_TPOFF32:
	    return R_TLS;
	  case R_X86_64_TLSGD:
	    return R_TLSGD_PC;
	  case R_X86_64_TLSLD:
	    return R_TLSLD_PC;

	  default:
	    return R_INVALID;
	  }
	}

	// Stores the address of the dynamic section in the first .got.plt slot.
	template <class ELFT> void X86_64<ELFT>::writeGotPltHeader(uint8_t *Buf) const {
	  // The first slot carries the value of _DYNAMIC. Why this is needed is not
	  // obvious, but the psabi documents it and the glibc dynamic linker appears
	  // to rely on it (this matters when linking ld.so itself, not ordinary
	  // programs).
	  const auto DynamicVA = InX::Dynamic->getVA();
	  write64le(Buf, DynamicVA);
	}

	// Initializes a .got.plt slot so that it points back into the symbol's own
	// PLT entry, 6 bytes in, i.e. just past the 6-byte `jmpq *got(%rip)` that
	// writePlt emits first (see comments in X86::writeGotPlt).
	template <class ELFT>
	void X86_64<ELFT>::writeGotPlt(uint8_t *Buf, const Symbol &S) const {
	  // A slot is GotPltEntrySize (8) bytes wide, so store the full 64-bit
	  // address; a 32-bit store would truncate PLT addresses above 4 GiB.
	  write64le(Buf, S.getPltVA() + 6);
	}

	// Writes the 16-byte PLT header: push the word at GOTPLT+8, jump through the
	// word at GOTPLT+16, then a 4-byte nop. Both operands are %rip-relative.
	template <class ELFT> void X86_64<ELFT>::writePltHeader(uint8_t *Buf) const {
	const uint8_t PltData[] = {
	0xff, 0x35, 0, 0, 0, 0, // pushq GOTPLT+8(%rip)
	0xff, 0x25, 0, 0, 0, 0, // jmp *GOTPLT+16(%rip)
	0x0f, 0x1f, 0x40, 0x00, // nop
	};
	memcpy(Buf, PltData, sizeof(PltData));
	uint64_t GotPlt = InX::GotPlt->getVA();
	uint64_t Plt = InX::Plt->getVA();
	// Each displacement is measured from the end of its instruction:
	// (GotPlt + 8) - (Plt + 6) = GotPlt - Plt + 2, and
	// (GotPlt + 16) - (Plt + 12) = GotPlt - Plt + 4.
	write32le(Buf + 2, GotPlt - Plt + 2); // GOTPLT+8
	write32le(Buf + 8, GotPlt - Plt + 4); // GOTPLT+16
	}

	// Writes one 16-byte PLT entry: an indirect jump through the symbol's
	// .got.plt slot, a push of Index, and a jump back to the start of the PLT.
	template <class ELFT>
	void X86_64<ELFT>::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr,
	uint64_t PltEntryAddr, int32_t Index,
	unsigned RelOff) const {
	const uint8_t Inst[] = {
	0xff, 0x25, 0, 0, 0, 0, // jmpq *got(%rip)
	0x68, 0, 0, 0, 0, // pushq <relocation index>
	0xe9, 0, 0, 0, 0, // jmpq plt[0]
	};
	memcpy(Buf, Inst, sizeof(Inst));

	// %rip-relative displacement, measured from the end of the 6-byte jmpq.
	write32le(Buf + 2, GotPltEntryAddr - PltEntryAddr - 6);
	// Immediate operand of the pushq. RelOff is not used here.
	write32le(Buf + 7, Index);
	// rel32 of the final jmpq, measured from the end of this 16-byte entry back
	// to the first byte of the PLT (header plus Index preceding entries).
	write32le(Buf + 12, -Index * PltEntrySize - PltHeaderSize - 16);
	}

	// Returns false for R_X86_64_PC32, R_X86_64_32 and R_X86_64_TPOFF32, and true
	// for every other relocation type.
	template <class ELFT> bool X86_64<ELFT>::isPicRel(RelType Type) const {
	  switch (Type) {
	  case R_X86_64_PC32:
	  case R_X86_64_32:
	  case R_X86_64_TPOFF32:
	    return false;
	  default:
	    return true;
	  }
	}

	// Rewrites a general-dynamic TLS access in place as a local-exec access.
	// Loc addresses the 4-byte relocated field of the original leaq; the sequence
	// being replaced starts 4 bytes before it. Type is not consulted.
	template <class ELFT>
	void X86_64<ELFT>::relaxTlsGdToLe(uint8_t *Loc, RelType Type,
	                                  uint64_t Val) const {
	  // Before:
	  //   .byte 0x66
	  //   leaq x@tlsgd(%rip), %rdi
	  //   .word 0x6666
	  //   rex64
	  //   call __tls_get_addr@plt
	  // After:
	  //   mov %fs:0x0,%rax
	  //   lea x@tpoff,%rax
	  const uint8_t Replacement[] = {
	      0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0x0,%rax
	      0x48, 0x8d, 0x80, 0,    0,    0,    0,                // lea x@tpoff,%rax
	  };
	  uint8_t *const SeqStart = Loc - 4;
	  memcpy(SeqStart, Replacement, sizeof(Replacement));

	  // The lea displacement occupies the last four bytes of the new sequence
	  // (offset 12 from its start). The relocation being replaced was
	  // PC-relative and carried a -4 in its addend, which is undone here.
	  write32le(SeqStart + 12, Val + 4);
	}

	// Rewrites a general-dynamic TLS access in place as an initial-exec access.
	// Loc addresses the 4-byte relocated field of the original leaq; the sequence
	// being replaced starts 4 bytes before it. Type is not consulted.
	template <class ELFT>
	void X86_64<ELFT>::relaxTlsGdToIe(uint8_t *Loc, RelType Type,
	                                  uint64_t Val) const {
	  // Before:
	  //   .byte 0x66
	  //   leaq x@tlsgd(%rip), %rdi
	  //   .word 0x6666
	  //   rex64
	  //   call __tls_get_addr@plt
	  // After:
	  //   mov %fs:0x0,%rax
	  //   addq x@tpoff,%rax
	  const uint8_t Replacement[] = {
	      0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0x0,%rax
	      0x48, 0x03, 0x05, 0,    0,    0,    0,                // addq x@tpoff,%rax
	  };
	  uint8_t *const SeqStart = Loc - 4;
	  memcpy(SeqStart, Replacement, sizeof(Replacement));

	  // Old and new operands are both PC-relative, but the 32-bit field now sits
	  // 8 bytes further along (offset 12 of the sequence instead of offset 4),
	  // so the value shrinks by 8.
	  write32le(SeqStart + 12, Val - 8);
	}

	// In some conditions, R_X86_64_GOTTPOFF relocation can be optimized to
	// R_X86_64_TPOFF32 so that it does not use GOT.
	//
	// Rewrites in place the instruction whose 4-byte operand field starts at Loc.
	// The three bytes before Loc (prefix, opcode, ModRM) select the rewrite; any
	// byte pattern not matched below is reported through error(). Val is stored
	// into the operand field as a 32-bit value. Type is not consulted.
	template <class ELFT>
	void X86_64<ELFT>::relaxTlsIeToLe(uint8_t *Loc, RelType Type,
	uint64_t Val) const {
	uint8_t *Inst = Loc - 3;
	// Register number taken from the byte just before the operand field.
	// NOTE(review): assumes the top two (mod) bits of that byte are zero, as
	// they are in the 0x25 patterns matched below -- confirm for the generic
	// two-byte matches.
	uint8_t Reg = Loc[-1] >> 3;
	uint8_t *RegSlot = Loc - 1;

	// Note that ADD with RSP or R12 is converted to ADD instead of LEA
	// because LEA with these registers needs 4 bytes to encode and thus
	// wouldn't fit the space.

	// The two three-byte patterns are tested first; the two-byte ADD patterns
	// after them would otherwise match the same instructions.
	if (memcmp(Inst, "\x48\x03\x25", 3) == 0) {
	// "addq foo@gottpoff(%rip),%rsp" -> "addq $foo,%rsp"
	memcpy(Inst, "\x48\x81\xc4", 3);
	} else if (memcmp(Inst, "\x4c\x03\x25", 3) == 0) {
	// "addq foo@gottpoff(%rip),%r12" -> "addq $foo,%r12"
	memcpy(Inst, "\x49\x81\xc4", 3);
	} else if (memcmp(Inst, "\x4c\x03", 2) == 0) {
	// "addq foo@gottpoff(%rip),%r[8-15]" -> "leaq foo(%r[8-15]),%r[8-15]"
	memcpy(Inst, "\x4d\x8d", 2);
	*RegSlot = 0x80 \| (Reg << 3) \| Reg;
	} else if (memcmp(Inst, "\x48\x03", 2) == 0) {
	// "addq foo@gottpoff(%rip),%reg" -> "leaq foo(%reg),%reg"
	memcpy(Inst, "\x48\x8d", 2);
	*RegSlot = 0x80 \| (Reg << 3) \| Reg;
	} else if (memcmp(Inst, "\x4c\x8b", 2) == 0) {
	// "movq foo@gottpoff(%rip),%r[8-15]" -> "movq $foo,%r[8-15]"
	memcpy(Inst, "\x49\xc7", 2);
	*RegSlot = 0xc0 \| Reg;
	} else if (memcmp(Inst, "\x48\x8b", 2) == 0) {
	// "movq foo@gottpoff(%rip),%reg" -> "movq $foo,%reg"
	memcpy(Inst, "\x48\xc7", 2);
	*RegSlot = 0xc0 \| Reg;
	} else {
	error(getErrorLocation(Loc - 3) +
	"R_X86_64_GOTTPOFF must be used in MOVQ or ADDQ instructions only");
	}

	// The original code used a PC relative relocation.
	// Need to compensate for the -4 it had in the addend.
	// There is no early return in the error branch above, so this store is
	// reached on that path too whenever error() returns.
	write32le(Loc, Val + 4);
	}

	// Relaxes a local-dynamic TLS access to local-exec. The full rewrite is
	//   leaq bar@tlsld(%rip), %rdi
	//   callq __tls_get_addr@PLT
	//   leaq bar@dtpoff(%rax), %rcx
	// becoming
	//   .word 0x6666
	//   .byte 0x66
	//   mov %fs:0,%rax
	//   leaq bar@tpoff(%rax), %rcx
	// The DTPOFF relocations only have Val stored at Loc; every other
	// relocation type replaces the leaq/callq pair with the padded mov.
	template <class ELFT>
	void X86_64<ELFT>::relaxTlsLdToLe(uint8_t *Loc, RelType Type,
	                                  uint64_t Val) const {
	  switch (Type) {
	  case R_X86_64_DTPOFF64:
	    write64le(Loc, Val);
	    return;
	  case R_X86_64_DTPOFF32:
	    write32le(Loc, Val);
	    return;
	  default:
	    break;
	  }

	  // The replacement starts 3 bytes before the relocated field.
	  const uint8_t Replacement[] = {
	      0x66, 0x66,                                           // .word 0x6666
	      0x66,                                                 // .byte 0x66
	      0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0,%rax
	  };
	  memcpy(Loc - 3, Replacement, sizeof(Replacement));
	}

	// Stores the already-computed relocation value Val at Loc, using the width
	// that Type requires. Range checks are delegated to checkUInt/checkInt; a
	// type with no case below is reported through error().
	template <class ELFT>
	void X86_64<ELFT>::relocateOne(uint8_t *Loc, RelType Type, uint64_t Val) const {
	switch (Type) {
	// 8-, 16- and 32-bit fields checked as unsigned values.
	case R_X86_64_8:
	checkUInt<8>(Loc, Val, Type);
	*Loc = Val;
	break;
	case R_X86_64_16:
	checkUInt<16>(Loc, Val, Type);
	write16le(Loc, Val);
	break;
	case R_X86_64_32:
	checkUInt<32>(Loc, Val, Type);
	write32le(Loc, Val);
	break;
	// 32-bit fields checked as signed values.
	case R_X86_64_32S:
	case R_X86_64_TPOFF32:
	case R_X86_64_GOT32:
	case R_X86_64_GOTPCREL:
	case R_X86_64_GOTPCRELX:
	case R_X86_64_REX_GOTPCRELX:
	case R_X86_64_PC32:
	case R_X86_64_GOTTPOFF:
	case R_X86_64_PLT32:
	case R_X86_64_TLSGD:
	case R_X86_64_TLSLD:
	case R_X86_64_DTPOFF32:
	case R_X86_64_SIZE32:
	checkInt<32>(Loc, Val, Type);
	write32le(Loc, Val);
	break;
	// Full 64-bit fields; no range check is performed.
	case R_X86_64_64:
	case R_X86_64_DTPOFF64:
	case R_X86_64_GLOB_DAT:
	case R_X86_64_PC64:
	case R_X86_64_SIZE64:
	case R_X86_64_GOT64:
	write64le(Loc, Val);
	break;
	default:
	error(getErrorLocation(Loc) + "unrecognized reloc " + Twine(Type));
	}
	}

	// Decides whether a GOTPCRELX-style relocation may be relaxed.
	//   Type    - relocation type; anything other than R_X86_64_GOTPCRELX or
	//             R_X86_64_REX_GOTPCRELX is passed through unchanged.
	//   Data    - points at the relocated field; the two bytes before it are
	//             read as the opcode and the ModRM byte.
	//   RelExpr - expression chosen so far, returned when no relaxation applies.
	// Returns R_RELAX_GOT_PC for mov and for indirect call/jmp,
	// R_RELAX_GOT_PC_NOPIC for the remaining instructions when Config->Pic is
	// false, and RelExpr otherwise.
	template <class ELFT>
	RelExpr X86_64<ELFT>::adjustRelaxExpr(RelType Type, const uint8_t *Data,
	RelExpr RelExpr) const {
	if (Type != R_X86_64_GOTPCRELX && Type != R_X86_64_REX_GOTPCRELX)
	return RelExpr;
	const uint8_t Op = Data[-2];
	const uint8_t ModRm = Data[-1];

	// FIXME: When PIC is disabled and foo is defined locally in the
	// lower 32 bit address space, memory operand in mov can be converted into
	// immediate operand. Otherwise, mov must be changed to lea. We support only
	// latter relaxation at this moment.
	if (Op == 0x8b)
	return R_RELAX_GOT_PC;

	// Relax call and jmp.
	if (Op == 0xff && (ModRm == 0x15 \|\| ModRm == 0x25))
	return R_RELAX_GOT_PC;

	// Relaxation of test, adc, add, and, cmp, or, sbb, sub, xor.
	// If PIC then no relaxation is available.
	// We also don't relax test/binop instructions without REX byte,
	// they are 32bit operations and not common to have.
	// Consequently only the REX form is expected to reach this point; the
	// assert enforces that in builds with assertions enabled.
	assert(Type == R_X86_64_REX_GOTPCRELX);
	return Config->Pic ? RelExpr : R_RELAX_GOT_PC_NOPIC;
	}

	// A subset of relaxations can only be applied for no-PIC. This method
	// handles such relaxations. Instructions encoding information was taken from:
	// "Intel 64 and IA-32 Architectures Software Developer's Manual V2"
	// (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/
	// 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)
	//
	//   Loc   - points at the 4-byte operand field; Loc[-3], Loc[-2] and Loc[-1]
	//           (REX prefix, opcode, ModRM) are rewritten in place.
	//   Val   - value stored into the operand field as a 32-bit immediate.
	//   Op    - original opcode byte; 0x85 selects the TEST rewrite, every other
	//           value is handled as a binary operation.
	//   ModRm - original ModRM byte.
	template <class ELFT>
	void X86_64<ELFT>::relaxGotNoPic(uint8_t *Loc, uint64_t Val, uint8_t Op,
	uint8_t ModRm) const {
	const uint8_t Rex = Loc[-3];
	// Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg".
	if (Op == 0x85) {
	// See "TEST-Logical Compare" (4-428 Vol. 2B),
	// TEST r/m64, r64 uses "full" ModR / M byte (no opcode extension).

	// ModR/M byte has form XX YYY ZZZ, where
	// YYY is MODRM.reg(register 2), ZZZ is MODRM.rm(register 1).
	// XX has different meanings:
	// 00: The operand's memory address is in reg1.
	// 01: The operand's memory address is reg1 + a byte-sized displacement.
	// 10: The operand's memory address is reg1 + a word-sized displacement.
	// 11: The operand is reg1 itself.
	// If an instruction requires only one operand, the unused reg2 field
	// holds extra opcode bits rather than a register code
	// 0xC0 == 11 000 000 binary.
	// 0x38 == 00 111 000 binary.
	// We transfer reg2 to reg1 here as operand.
	// See "2.1.3 ModR/M and SIB Bytes" (Vol. 2A 2-3).
	Loc[-1] = 0xc0 \| (ModRm & 0x38) >> 3; // ModR/M byte.

	// Change opcode from TEST r/m64, r64 to TEST r/m64, imm32
	// See "TEST-Logical Compare" (4-428 Vol. 2B).
	Loc[-2] = 0xf7;

	// Move R bit to the B bit in REX byte.
	// REX byte is encoded as 0100WRXB, where
	// 0100 is 4bit fixed pattern.
	// REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the
	// default operand size is used (which is 32-bit for most but not all
	// instructions).
	// REX.R This 1-bit value is an extension to the MODRM.reg field.
	// REX.X This 1-bit value is an extension to the SIB.index field.
	// REX.B This 1-bit value is an extension to the MODRM.rm field or the
	// SIB.base field.
	// See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A).
	Loc[-3] = (Rex & ~0x4) \| (Rex & 0x4) >> 2;
	write32le(Loc, Val);
	return;
	}

	// If we are here then we need to relax the adc, add, and, cmp, or, sbb, sub
	// or xor operations.

	// Convert "binop foo@GOTPCREL(%rip), %reg" to "binop $foo, %reg".
	// Logic is close to one for test instruction above, but we also
	// write opcode extension here, see below for details.
	Loc[-1] = 0xc0 \| (ModRm & 0x38) >> 3 \| (Op & 0x3c); // ModR/M byte.

	// Primary opcode is 0x81, opcode extension is one of:
	// 000b = ADD, 001b is OR, 010b is ADC, 011b is SBB,
	// 100b is AND, 101b is SUB, 110b is XOR, 111b is CMP.
	// This value was written to MODRM.reg in a line above.
	// See "3.2 INSTRUCTIONS (A-M)" (Vol. 2A 3-15),
	// "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for
	// descriptions about each operation.
	Loc[-2] = 0x81;
	// Same REX.R -> REX.B move as in the TEST case above.
	Loc[-3] = (Rex & ~0x4) \| (Rex & 0x4) >> 2;
	write32le(Loc, Val);
	}

	// Applies a GOT relaxation to the instruction whose 4-byte operand field
	// starts at Loc. The opcode is read from Loc[-2] and the ModRM byte from
	// Loc[-1]; Val is the value to encode in the rewritten instruction.
	// Handles mov -> lea, indirect call/jmp -> direct call/jmp, and forwards
	// every other opcode to relaxGotNoPic.
	template <class ELFT>
	void X86_64<ELFT>::relaxGot(uint8_t *Loc, uint64_t Val) const {
	const uint8_t Op = Loc[-2];
	const uint8_t ModRm = Loc[-1];

	// Convert "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
	if (Op == 0x8b) {
	Loc[-2] = 0x8d;
	write32le(Loc, Val);
	return;
	}

	if (Op != 0xff) {
	// We are relaxing a rip relative to an absolute, so compensate
	// for the old -4 addend.
	assert(!Config->Pic);
	relaxGotNoPic(Loc, Val + 4, Op, ModRm);
	return;
	}

	// Convert call/jmp instructions.
	if (ModRm == 0x15) {
	// ABI says we can convert "call *foo@GOTPCREL(%rip)" to "nop; call foo".
	// Instead we convert to "addr32 call foo" where addr32 is an instruction
	// prefix. That makes result expression to be a single instruction.
	Loc[-2] = 0x67; // addr32 prefix
	Loc[-1] = 0xe8; // call
	write32le(Loc, Val);
	return;
	}

	// Convert "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop".
	// jmp doesn't return, so it is fine to use nop here, it is just a stub.
	assert(ModRm == 0x25);
	Loc[-2] = 0xe9; // jmp
	Loc[3] = 0x90; // nop
	// The direct jmp is one byte shorter, so its rel32 starts at Loc - 1 and
	// the freed last byte holds the nop written above. Presumably Val is bumped
	// by 1 because the instruction now ends one byte earlier.
	write32le(Loc - 1, Val + 1);
	}

	-TargetInfo *elf::getX32TargetInfo() {
	- static X86_64<ELF32LE> Target;
	- return &Target;
	+namespace {
	+template <class ELFT> class Retpoline : public X86_64<ELFT> {
	+public:
	+ Retpoline();
	+ void writeGotPlt(uint8_t *Buf, const Symbol &S) const override;
	+ void writePltHeader(uint8_t *Buf) const override;
	+ void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr,
	+ int32_t Index, unsigned RelOff) const override;
	+};
	+
	+template <class ELFT> class RetpolineZNow : public X86_64<ELFT> {
	+public:
	+ RetpolineZNow();
	+ void writeGotPlt(uint8_t *Buf, const Symbol &S) const override {}
	+ void writePltHeader(uint8_t *Buf) const override;
	+ void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr,
	+ int32_t Index, unsigned RelOff) const override;
	+};
	+} // namespace
	+
	+template <class ELFT> Retpoline<ELFT>::Retpoline() {
	+ TargetInfo::PltHeaderSize = 48;
	+ TargetInfo::PltEntrySize = 32;
	}

	-TargetInfo *elf::getX86_64TargetInfo() {
	- static X86_64<ELF64LE> Target;
	- return &Target;
	+// Writes the initial (pre-binding) content of the .got.plt slot at Buf for
	+// symbol S when retpoline PLT entries are in use.
	+template <class ELFT>
	+void Retpoline<ELFT>::writeGotPlt(uint8_t *Buf, const Symbol &S) const {
	+ // The slot initially points 17 (0x11) bytes into S's PLT entry, which is
	+ // where Retpoline::writePlt places the "pushq <relocation index>".
	+ // Store the full 64-bit address: a 32-bit store would drop the upper half
	+ // of a PLT address at or above 4 GiB.
	+ // NOTE(review): relies on .got.plt slots being 8 bytes wide for this
	+ // target -- confirm against GotPltEntrySize in the base class.
	+ write64le(Buf, S.getPltVA() + 17);
	}
	+
	+template <class ELFT> void Retpoline<ELFT>::writePltHeader(uint8_t *Buf) const {
	+ const uint8_t Insn[] = {
	+ 0xff, 0x35, 0, 0, 0, 0, // 0: pushq GOTPLT+8(%rip)
	+ 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 6: mov GOTPLT+16(%rip), %r11
	+ 0xe8, 0x0e, 0x00, 0x00, 0x00, // d: callq next
	+ 0xf3, 0x90, // 12: loop: pause
	+ 0x0f, 0xae, 0xe8, // 14: lfence
	+ 0xeb, 0xf9, // 17: jmp loop
	+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19: int3; .align 16
	+ 0x4c, 0x89, 0x1c, 0x24, // 20: next: mov %r11, (%rsp)
	+ 0xc3, // 24: ret
	+ };
	+ memcpy(Buf, Insn, sizeof(Insn));
	+
	+ uint64_t GotPlt = InX::GotPlt->getVA();
	+ uint64_t Plt = InX::Plt->getVA();
	+ write32le(Buf + 2, GotPlt - Plt - 6 + 8);
	+ write32le(Buf + 9, GotPlt - Plt - 13 + 16);
	+}
	+
	+template <class ELFT>
	+void Retpoline<ELFT>::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr,
	+ uint64_t PltEntryAddr, int32_t Index,
	+ unsigned RelOff) const {
	+ const uint8_t Insn[] = {
	+ 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 0: mov foo@GOTPLT(%rip), %r11
	+ 0xe8, 0, 0, 0, 0, // 7: callq plt+0x20
	+ 0xe9, 0, 0, 0, 0, // c: jmp plt+0x12
	+ 0x68, 0, 0, 0, 0, // 11: pushq <relocation index>
	+ 0xe9, 0, 0, 0, 0, // 16: jmp plt+0
	+ };
	+ memcpy(Buf, Insn, sizeof(Insn));
	+
	+ uint64_t Off = TargetInfo::PltHeaderSize + TargetInfo::PltEntrySize * Index;
	+
	+ write32le(Buf + 3, GotPltEntryAddr - PltEntryAddr - 7);
	+ write32le(Buf + 8, -Off - 12 + 32);
	+ write32le(Buf + 13, -Off - 17 + 18);
	+ write32le(Buf + 18, Index);
	+ write32le(Buf + 23, -Off - 27);
	+}
	+
	+template <class ELFT> RetpolineZNow<ELFT>::RetpolineZNow() {
	+ TargetInfo::PltHeaderSize = 32;
	+ TargetInfo::PltEntrySize = 16;
	+}
	+
	+template <class ELFT>
	+void RetpolineZNow<ELFT>::writePltHeader(uint8_t *Buf) const {
	+ const uint8_t Insn[] = {
	+ 0xe8, 0x0b, 0x00, 0x00, 0x00, // 0: call next
	+ 0xf3, 0x90, // 5: loop: pause
	+ 0x0f, 0xae, 0xe8, // 7: lfence
	+ 0xeb, 0xf9, // a: jmp loop
	+ 0xcc, 0xcc, 0xcc, 0xcc, // c: int3; .align 16
	+ 0x4c, 0x89, 0x1c, 0x24, // 10: next: mov %r11, (%rsp)
	+ 0xc3, // 14: ret
	+ };
	+ memcpy(Buf, Insn, sizeof(Insn));
	+}
	+
	+template <class ELFT>
	+void RetpolineZNow<ELFT>::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr,
	+ uint64_t PltEntryAddr, int32_t Index,
	+ unsigned RelOff) const {
	+ const uint8_t Insn[] = {
	+ 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // mov foo@GOTPLT(%rip), %r11
	+ 0xe9, 0, 0, 0, 0, // jmp plt+0
	+ };
	+ memcpy(Buf, Insn, sizeof(Insn));
	+
	+ write32le(Buf + 3, GotPltEntryAddr - PltEntryAddr - 7);
	+ write32le(Buf + 8,
	+ -Index * TargetInfo::PltEntrySize - TargetInfo::PltHeaderSize - 12);
	+}
	+
	+template <class ELFT> TargetInfo *getTargetInfo() {
	+ if (Config->ZRetpolineplt) {
	+ if (Config->ZNow) {
	+ static RetpolineZNow<ELFT> T;
	+ return &T;
	+ }
	+ static Retpoline<ELFT> T;
	+ return &T;
	+ }
	+
	+ static X86_64<ELFT> T;
	+ return &T;
	+}
	+
	+TargetInfo *elf::getX32TargetInfo() { return getTargetInfo<ELF32LE>(); }
	+TargetInfo *elf::getX86_64TargetInfo() { return getTargetInfo<ELF64LE>(); }
	Index: head/contrib/llvm/tools/lld/ELF/Config.h
	===================================================================
	--- head/contrib/llvm/tools/lld/ELF/Config.h (revision 328816)
	+++ head/contrib/llvm/tools/lld/ELF/Config.h (revision 328817)
	@@ -1,244 +1,245 @@
	//===- Config.h -------------------------------------------------*- C++ -*-===//
	//
	// The LLVM Linker
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLD_ELF_CONFIG_H
	#define LLD_ELF_CONFIG_H

	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSet.h"
	#include "llvm/BinaryFormat/ELF.h"
	#include "llvm/Support/CachePruning.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/Endian.h"

	#include <vector>

	namespace lld {
	namespace elf {

	class InputFile;

	// Identifies the ELF flavour of a link target.
	// ELFNoneKind is the "not determined" value: it is the default of
	// Configuration::EKind, and parseEmulation in the driver reports an unknown
	// emulation when it ends up with it.
	// NOTE(review): the remaining names presumably encode word size (32/64) and
	// byte order (LE/BE) -- confirm.
	enum ELFKind {
	ELFNoneKind,
	ELF32LEKind,
	ELF32BEKind,
	ELF64LEKind,
	ELF64BEKind
	};

	// For --build-id.
	enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid };

	// For --discard-{all,locals,none}.
	enum class DiscardPolicy { Default, All, Locals, None };

	// For --strip-{all,debug}.
	enum class StripPolicy { None, All, Debug };

	// For --unresolved-symbols.
	enum class UnresolvedPolicy { ReportError, Warn, Ignore, IgnoreAll };

	// For --orphan-handling.
	enum class OrphanHandlingPolicy { Place, Warn, Error };

	// For --sort-section and linkerscript sorting rules.
	enum class SortSectionPolicy { Default, None, Alignment, Name, Priority };

	// For --target2
	enum class Target2Policy { Abs, Rel, GotRel };

	// One symbol name or pattern together with two flags describing it. Used as
	// the element type of VersionDefinition::Globals and of the DynamicList and
	// VersionScript* vectors in Configuration.
	struct SymbolVersion {
	llvm::StringRef Name;
	// NOTE(review): presumably set for entries given in an extern "C++" block,
	// i.e. Name is a demangled C++ name -- confirm against the script parser.
	bool IsExternCpp;
	// NOTE(review): presumably set when Name contains glob metacharacters --
	// confirm against the script parser.
	bool HasWildcard;
	};

	// This struct contains a symbol version definition that
	// can be found in a version script if one is used for the link.
	struct VersionDefinition {
	// Name of the version.
	llvm::StringRef Name;
	// Numeric identifier of the version; zero until assigned.
	uint16_t Id = 0;
	// Symbols listed for this version.
	std::vector<SymbolVersion> Globals;
	size_t NameOff = 0; // Offset in the string table
	};

	// This struct contains the global configuration for the linker.
	// Most fields are direct mapping from the command line options
	// and such fields have the same name as the corresponding options.
	// Most fields are initialized by the driver.
	struct Configuration {
	uint8_t OSABI = 0;
	llvm::CachePruningPolicy ThinLTOCachePolicy;
	llvm::StringMap<uint64_t> SectionStartMap;
	llvm::StringRef Chroot;
	llvm::StringRef DynamicLinker;
	llvm::StringRef Entry;
	llvm::StringRef Emulation;
	llvm::StringRef Fini;
	llvm::StringRef Init;
	llvm::StringRef LTOAAPipeline;
	llvm::StringRef LTONewPmPasses;
	llvm::StringRef MapFile;
	llvm::StringRef OutputFile;
	llvm::StringRef OptRemarksFilename;
	llvm::StringRef SoName;
	llvm::StringRef Sysroot;
	llvm::StringRef ThinLTOCacheDir;
	std::string Rpath;
	std::vector<VersionDefinition> VersionDefinitions;
	std::vector<llvm::StringRef> Argv;
	std::vector<llvm::StringRef> AuxiliaryList;
	std::vector<llvm::StringRef> FilterList;
	std::vector<llvm::StringRef> SearchPaths;
	std::vector<llvm::StringRef> SymbolOrderingFile;
	std::vector<llvm::StringRef> Undefined;
	std::vector<SymbolVersion> DynamicList;
	std::vector<SymbolVersion> VersionScriptGlobals;
	std::vector<SymbolVersion> VersionScriptLocals;
	std::vector<uint8_t> BuildIdVector;
	bool AllowMultipleDefinition;
	bool AndroidPackDynRelocs = false;
	bool ARMHasBlx = false;
	bool ARMHasMovtMovw = false;
	bool ARMJ1J2BranchEncoding = false;
	bool AsNeeded = false;
	bool Bsymbolic;
	bool BsymbolicFunctions;
	bool CompressDebugSections;
	bool DefineCommon;
	bool Demangle = true;
	bool DisableVerify;
	bool EhFrameHdr;
	bool EmitRelocs;
	bool EnableNewDtags;
	bool ExportDynamic;
	bool FixCortexA53Errata843419;
	bool GcSections;
	bool GdbIndex;
	bool GnuHash = false;
	bool HasDynamicList = false;
	bool HasDynSymTab;
	bool ICF;
	bool ICFData;
	bool MergeArmExidx;
	bool MipsN32Abi = false;
	bool NoGnuUnique;
	bool NoUndefinedVersion;
	bool NoinhibitExec;
	bool Nostdlib;
	bool OFormatBinary;
	bool Omagic;
	bool OptRemarksWithHotness;
	bool Pie;
	bool PrintGcSections;
	bool Relocatable;
	bool SaveTemps;
	bool SingleRoRx;
	bool Shared;
	bool Static = false;
	bool SysvHash = false;
	bool Target1Rel;
	bool Trace;
	bool Verbose;
	bool WarnCommon;
	bool WarnMissingEntry;
	bool ZCombreloc;
	bool ZExecstack;
	bool ZNocopyreloc;
	bool ZNodelete;
	bool ZNodlopen;
	bool ZNow;
	bool ZOrigin;
	bool ZRelro;
	bool ZRodynamic;
	bool ZText;
	+ bool ZRetpolineplt;
	bool ExitEarly;
	bool ZWxneeded;
	DiscardPolicy Discard;
	OrphanHandlingPolicy OrphanHandling;
	SortSectionPolicy SortSection;
	StripPolicy Strip;
	UnresolvedPolicy UnresolvedSymbols;
	Target2Policy Target2;
	BuildIdKind BuildId = BuildIdKind::None;
	ELFKind EKind = ELFNoneKind;
	uint16_t DefaultSymbolVersion = llvm::ELF::VER_NDX_GLOBAL;
	uint16_t EMachine = llvm::ELF::EM_NONE;
	llvm::Optional<uint64_t> ImageBase;
	uint64_t MaxPageSize;
	uint64_t ZStackSize;
	unsigned LTOPartitions;
	unsigned LTOO;
	unsigned Optimize;
	unsigned ThinLTOJobs;

	// The following config options do not directly correspond to any
	// particular command line options.

	// True if we need to pass through relocations in input files to the
	// output file. Usually false because we consume relocations.
	bool CopyRelocs;

	// True if the target is ELF64. False if ELF32.
	bool Is64;

	// True if the target is little-endian. False if big-endian.
	bool IsLE;

	// endianness::little if IsLE is true. endianness::big otherwise.
	llvm::support::endianness Endianness;

	// True if the target is the little-endian MIPS64.
	//
	// The reason why we have this variable only for the MIPS is because
	// we use this often. Some ELF headers for MIPS64EL are in a
	// mixed-endian (which is horrible and I'd say that's a serious spec
	// bug), and we need to know whether we are reading MIPS ELF files or
	// not in various places.
	//
	// (Note that MIPS64EL is not a typo for MIPS64LE. This is the official
	// name whatever that means. A fun hypothesis is that "EL" is short for
	// little-endian written in the little-endian order, but I don't know
	// if that's true.)
	bool IsMips64EL;

	// Holds set of ELF header flags for the target.
	uint32_t EFlags = 0;

	// The ELF spec defines two types of relocation table entries, RELA and
	// REL. RELA is a triplet of (offset, info, addend) while REL is a
	// tuple of (offset, info). Addends for REL are implicit and read from
	// the location where the relocations are applied. So, REL is more
	// compact than RELA but requires a bit of more work to process.
	//
	// (From the linker writer's view, this distinction is not necessary.
	// If the ELF had chosen whichever and sticked with it, it would have
	// been easier to write code to process relocations, but it's too late
	// to change the spec.)
	//
	// Each ABI defines its relocation type. IsRela is true if target
	// uses RELA. As far as we know, all 64-bit ABIs are using RELA. A
	// few 32-bit ABIs are using RELA too.
	bool IsRela;

	// True if we are creating position-independent code.
	bool Pic;

	// 4 for ELF32, 8 for ELF64.
	int Wordsize;
	};

	// The only instance of Configuration struct.
	extern Configuration *Config;

	} // namespace elf
	} // namespace lld

	#endif
	Index: head/contrib/llvm/tools/lld/ELF/Driver.cpp
	===================================================================
	--- head/contrib/llvm/tools/lld/ELF/Driver.cpp (revision 328816)
	+++ head/contrib/llvm/tools/lld/ELF/Driver.cpp (revision 328817)
	@@ -1,1127 +1,1128 @@
	//===- Driver.cpp ---------------------------------------------------------===//
	//
	// The LLVM Linker
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// The driver drives the entire linking process. It is responsible for
	// parsing command line options and doing whatever it is instructed to do.
	//
	// One notable thing in the LLD's driver when compared to other linkers is
	// that the LLD's driver is agnostic on the host operating system.
	// Other linkers usually have implicit default values (such as a dynamic
	// linker path or library paths) for each host OS.
	//
	// I don't think implicit default values are useful because they are
	// usually explicitly specified by the compiler driver. They can even
	// be harmful when you are doing cross-linking. Therefore, in LLD, we
	// simply trust the compiler driver to pass all required options and
	// don't try to make effort on our side.
	//
	//===----------------------------------------------------------------------===//

	#include "Driver.h"
	#include "Config.h"
	#include "Filesystem.h"
	#include "ICF.h"
	#include "InputFiles.h"
	#include "InputSection.h"
	#include "LinkerScript.h"
	#include "OutputSections.h"
	#include "ScriptParser.h"
	#include "Strings.h"
	#include "SymbolTable.h"
	#include "Symbols.h"
	#include "SyntheticSections.h"
	#include "Target.h"
	#include "Writer.h"
	#include "lld/Common/Args.h"
	#include "lld/Common/Driver.h"
	#include "lld/Common/ErrorHandler.h"
	#include "lld/Common/Memory.h"
	#include "lld/Common/Threads.h"
	#include "lld/Common/Version.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compression.h"
	#include "llvm/Support/Path.h"
	#include "llvm/Support/TarWriter.h"
	#include "llvm/Support/TargetSelect.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cstdlib>
	#include <utility>

	using namespace llvm;
	using namespace llvm::ELF;
	using namespace llvm::object;
	using namespace llvm::sys;

	using namespace lld;
	using namespace lld::elf;

	Configuration *elf::Config;
	LinkerDriver *elf::Driver;

	static void setConfigs();

	// Entry point of the ELF linker.
	//   Args         - command line, with Args[0] used as the name shown in
	//                  diagnostics.
	//   CanExitEarly - forwarded to LinkerDriver::main.
	//   Error        - stream that receives diagnostics.
	// Returns true if no errors were reported. If Config->ExitEarly is set once
	// the driver has run, exitLld() is called instead of returning.
	bool elf::link(ArrayRef<const char *> Args, bool CanExitEarly,
	raw_ostream &Error) {
	// Configure the shared error handler before anything can report an error.
	errorHandler().LogName = Args[0];
	errorHandler().ErrorLimitExceededMsg =
	"too many errors emitted, stopping now (use "
	"-error-limit=0 to see all errors)";
	errorHandler().ErrorOS = &Error;
	errorHandler().ColorDiagnostics = Error.has_colors();
	// Clear the global containers; presumably this lets link() be called more
	// than once in the same process.
	InputSections.clear();
	OutputSections.clear();
	Tar = nullptr;
	BinaryFiles.clear();
	BitcodeFiles.clear();
	ObjectFiles.clear();
	SharedFiles.clear();

	// Fresh singletons for this run, allocated with make<>().
	Config = make<Configuration>();
	Driver = make<LinkerDriver>();
	Script = make<LinkerScript>();
	Symtab = make<SymbolTable>();
	Config->Argv = {Args.begin(), Args.end()};

	Driver->main(Args, CanExitEarly);

	// Exit immediately if we don't need to return to the caller.
	// This saves time because the overhead of calling destructors
	// for all globally-allocated objects is not negligible.
	if (Config->ExitEarly)
	exitLld(errorCount() ? 1 : 0);

	freeArena();
	return !errorCount();
	}

	// Parses a linker -m option.
	// Returns (ELF kind, ELF machine, OS ABI) for the emulation name Emul.
	// A trailing "_fbsd" is stripped before the lookup and selects
	// ELFOSABI_FREEBSD; otherwise the OS ABI is 0. An unrecognized name is
	// reported through error() and yields ELFNoneKind / EM_NONE.
	static std::tuple<ELFKind, uint16_t, uint8_t> parseEmulation(StringRef Emul) {
	uint8_t OSABI = 0;
	StringRef S = Emul;
	if (S.endswith("_fbsd")) {
	// Drop the 5-character "_fbsd" suffix.
	S = S.drop_back(5);
	OSABI = ELFOSABI_FREEBSD;
	}

	std::pair<ELFKind, uint16_t> Ret =
	StringSwitch<std::pair<ELFKind, uint16_t>>(S)
	.Cases("aarch64elf", "aarch64linux", {ELF64LEKind, EM_AARCH64})
	.Cases("armelf", "armelf_linux_eabi", {ELF32LEKind, EM_ARM})
	.Case("elf32_x86_64", {ELF32LEKind, EM_X86_64})
	.Cases("elf32btsmip", "elf32btsmipn32", {ELF32BEKind, EM_MIPS})
	.Cases("elf32ltsmip", "elf32ltsmipn32", {ELF32LEKind, EM_MIPS})
	.Case("elf32ppc", {ELF32BEKind, EM_PPC})
	.Case("elf64btsmip", {ELF64BEKind, EM_MIPS})
	.Case("elf64ltsmip", {ELF64LEKind, EM_MIPS})
	.Case("elf64ppc", {ELF64BEKind, EM_PPC64})
	.Cases("elf_amd64", "elf_x86_64", {ELF64LEKind, EM_X86_64})
	.Case("elf_i386", {ELF32LEKind, EM_386})
	.Case("elf_iamcu", {ELF32LEKind, EM_IAMCU})
	.Default({ELFNoneKind, EM_NONE});

	// The message uses the original spelling, including any "_fbsd" suffix.
	if (Ret.first == ELFNoneKind)
	error("unknown emulation: " + Emul);
	return std::make_tuple(Ret.first, Ret.second, OSABI);
	}

	// Parses MB as an archive and returns one (buffer, child offset) pair per
	// member, in iteration order. Parse and iteration failures are fatal.
	static std::vector<std::pair<MemoryBufferRef, uint64_t>>
	getArchiveMembers(MemoryBufferRef MB) {
	  std::unique_ptr<Archive> Ar =
	      CHECK(Archive::create(MB),
	            MB.getBufferIdentifier() + ": failed to parse archive");

	  std::vector<std::pair<MemoryBufferRef, uint64_t>> Members;
	  Error IterErr = Error::success();

	  // Members are copied into Tar only for thin archives, and only when a
	  // tar writer is active.
	  const bool CopyToTar = Ar->isThin() && Tar;

	  for (const ErrorOr<Archive::Child> &ChildOrErr : Ar->children(IterErr)) {
	    Archive::Child Child =
	        CHECK(ChildOrErr, MB.getBufferIdentifier() +
	                              ": could not get the child of the archive");
	    MemoryBufferRef ChildBuf =
	        CHECK(Child.getMemoryBufferRef(),
	              MB.getBufferIdentifier() +
	                  ": could not get the buffer for a child of the archive");
	    if (CopyToTar)
	      Tar->append(relativeToRoot(check(Child.getFullName())),
	                  ChildBuf.getBuffer());
	    Members.emplace_back(ChildBuf, Child.getChildOffset());
	  }

	  if (IterErr)
	    fatal(MB.getBufferIdentifier() + ": Archive::children failed: " +
	          toString(std::move(IterErr)));

	  // The buffers backing thin-archive members are handed to make<>() so they
	  // stay alive after the Archive object is destroyed.
	  for (std::unique_ptr<MemoryBuffer> &Owned : Ar->takeThinBuffers())
	    make<std::unique_ptr<MemoryBuffer>>(std::move(Owned));

	  return Members;
	}

	// Opens a file and creates a file object. Path has to be resolved already.
	//   Path        - file to read; nothing is added if readFile() yields no
	//                 buffer.
	//   WithLOption - when true and the file is a shared object, only the
	//                 filename part of Path is passed on as the default soname.
	// The kind of object appended to Files depends on the InBinary,
	// InWholeArchive and InLib state and on the file's magic number.
	void LinkerDriver::addFile(StringRef Path, bool WithLOption) {
	using namespace sys::fs;

	Optional<MemoryBufferRef> Buffer = readFile(Path);
	if (!Buffer.hasValue())
	return;
	MemoryBufferRef MBRef = *Buffer;

	// When InBinary is set, the file is added as a BinaryFile without looking
	// at its magic number.
	if (InBinary) {
	Files.push_back(make<BinaryFile>(MBRef));
	return;
	}

	switch (identify_magic(MBRef.getBuffer())) {
	case file_magic::unknown:
	// Anything unrecognized is treated as a linker script.
	readLinkerScript(MBRef);
	return;
	case file_magic::archive: {
	// Handle -whole-archive.
	if (InWholeArchive) {
	for (const auto &P : getArchiveMembers(MBRef))
	Files.push_back(createObjectFile(P.first, Path, P.second));
	return;
	}

	std::unique_ptr<Archive> File =
	CHECK(Archive::create(MBRef), Path + ": failed to parse archive");

	// If an archive file has no symbol table, it is likely that a user
	// is attempting LTO and using a default ar command that doesn't
	// understand the LLVM bitcode file. It is a pretty common error, so
	// we'll handle it as if it had a symbol table.
	if (!File->isEmpty() && !File->hasSymbolTable()) {
	for (const auto &P : getArchiveMembers(MBRef))
	Files.push_back(make<LazyObjFile>(P.first, Path, P.second));
	return;
	}

	// Handle the regular case.
	Files.push_back(make<ArchiveFile>(std::move(File)));
	return;
	}
	case file_magic::elf_shared_object:
	if (Config->Relocatable) {
	error("attempted static link of dynamic object " + Path);
	return;
	}

	// DSOs usually have DT_SONAME tags in their ELF headers, and the
	// sonames are used to identify DSOs. But if they are missing,
	// they are identified by filenames. We don't know whether the new
	// file has a DT_SONAME or not because we haven't parsed it yet.
	// Here, we set the default soname for the file because we might
	// need it later.
	//
	// If a file was specified by -lfoo, the directory part is not
	// significant, as a user did not specify it. This behavior is
	// compatible with GNU.
	Files.push_back(
	createSharedFile(MBRef, WithLOption ? path::filename(Path) : Path));
	return;
	default:
	// Every other magic is handled as an object file: as a lazy one when InLib
	// is set, otherwise as a regular one.
	if (InLib)
	Files.push_back(make<LazyObjFile>(MBRef, "", 0));
	else
	Files.push_back(createObjectFile(MBRef));
	}
	}

	// Add a given library by searching it from input search paths.
	//   Name - library name as given to -l (without the "-l" prefix).
	// Reports an error if searchLibrary() finds nothing.
	void LinkerDriver::addLibrary(StringRef Name) {
	  // searchLibrary() returns an Optional; it must be dereferenced to obtain
	  // the path string that addFile() expects.
	  if (Optional<std::string> Path = searchLibrary(Name))
	    addFile(*Path, /*WithLOption=*/true);
	  else
	    error("unable to find library -l" + Name);
	}

	// Called on startup. LTO compiles bitcode to native code through LLVM, so
	// the LLVM targets have to be registered first. This could be deferred until
	// a bitcode file is actually read, but the initialization is fast enough
	// that doing it eagerly is simpler.
	static void initLLVM(opt::InputArgList &Args) {
	  InitializeAllTargets();
	  InitializeAllTargetMCs();
	  InitializeAllAsmPrinters();
	  InitializeAllAsmParsers();

	  // Build an argv for LLVM's own option parser: a fixed program name
	  // followed by the value of every -mllvm option, in command-line order.
	  std::vector<const char *> LLVMArgv = {"lld (LLVM option parsing)"};
	  for (auto *MllvmArg : Args.filtered(OPT_mllvm))
	    LLVMArgv.push_back(MllvmArg->getValue());

	  cl::ParseCommandLineOptions(LLVMArgv.size(), LLVMArgv.data());
	}

	// Some command line options or some combinations of them are not allowed.
	// This function checks for such errors.
	// Every check reads the already-populated Config; Args itself is not
	// consulted. Each violation is reported through error() and checking
	// continues, so several problems can be reported in one run.
	static void checkOptions(opt::InputArgList &Args) {
	// The MIPS ABI as of 2016 does not support the GNU-style symbol lookup
	// table which is a relatively new feature.
	if (Config->EMachine == EM_MIPS && Config->GnuHash)
	error("the .gnu.hash section is not compatible with the MIPS target.");

	if (Config->FixCortexA53Errata843419 && Config->EMachine != EM_AARCH64)
	error("--fix-cortex-a53-843419 is only supported on AArch64 targets.");

	if (Config->Pie && Config->Shared)
	error("-shared and -pie may not be used together");

	if (!Config->Shared && !Config->FilterList.empty())
	error("-F may not be used without -shared");

	if (!Config->Shared && !Config->AuxiliaryList.empty())
	error("-f may not be used without -shared");

	// Leaving common symbols undefined is only accepted for -r output.
	if (!Config->Relocatable && !Config->DefineCommon)
	error("-no-define-common not supported in non relocatable output");

	// Options rejected in combination with -r.
	if (Config->Relocatable) {
	if (Config->Shared)
	error("-r and -shared may not be used together");
	if (Config->GcSections)
	error("-r and --gc-sections may not be used together");
	if (Config->ICF)
	error("-r and --icf may not be used together");
	if (Config->Pie)
	error("-r and -pie may not be used together");
	}
	}

	// Returns the value of the last --reproduce option. When the option is
	// absent, falls back to the LLD_REPRODUCE environment variable, which
	// yields a null pointer if that variable is unset.
	static const char *getReproduceOption(opt::InputArgList &Args) {
	  auto *ReproduceArg = Args.getLastArg(OPT_reproduce);
	  return ReproduceArg ? ReproduceArg->getValue() : getenv("LLD_REPRODUCE");
	}

	// Returns true if any "-z <value>" option on the command line has a
	// value equal to Key.
	static bool hasZOption(opt::InputArgList &Args, StringRef Key) {
	  for (auto *ZArg : Args.filtered(OPT_z)) {
	    StringRef Value = ZArg->getValue();
	    if (Value == Key)
	      return true;
	  }
	  return false;
	}

	// Driver entry point. Parses the command line, handles the options that
	// end the run early (-help, -v, -version), sets up the --reproduce
	// archive, fills in Config, loads the input files and finally dispatches
	// to link<ELFT>() for the ELF kind that was selected or inferred.
	//
	// ArgsArr is the full argument vector; its first element is not parsed
	// as an option and is passed to printHelp(). CanExitEarly is combined
	// with -full-shutdown to decide Config->ExitEarly.
	void LinkerDriver::main(ArrayRef<const char *> ArgsArr, bool CanExitEarly) {
	  ELFOptTable Parser;
	  opt::InputArgList Args = Parser.parse(ArgsArr.slice(1));

	  // Interpret this flag early because error() depends on them.
	  errorHandler().ErrorLimit = args::getInteger(Args, OPT_error_limit, 20);

	  // Handle -help
	  if (Args.hasArg(OPT_help)) {
	    printHelp(ArgsArr[0]);
	    return;
	  }

	  // Handle -v or -version.
	  //
	  // A note about "compatible with GNU linkers" message: this is a hack for
	  // scripts generated by GNU Libtool 2.4.6 (released in February 2014 and
	  // still the newest version in March 2017) or earlier to recognize LLD as
	  // a GNU compatible linker. As long as an output for the -v option
	  // contains "GNU" or "with BFD", they recognize us as GNU-compatible.
	  //
	  // This is somewhat ugly hack, but in reality, we had no choice other
	  // than doing this. Considering the very long release cycle of Libtool,
	  // it is not easy to improve it to recognize LLD as a GNU compatible
	  // linker in a timely manner. Even if we can make it, there are still a
	  // lot of "configure" scripts out there that are generated by old version
	  // of Libtool. We cannot convince every software developer to migrate to
	  // the latest version and re-generate scripts. So we have this hack.
	  if (Args.hasArg(OPT_v) || Args.hasArg(OPT_version))
	    message(getLLDVersion() + " (compatible with GNU linkers)");

	  // The behavior of -v or --version is a bit strange, but this is
	  // needed for compatibility with GNU linkers: -v alone stops here, -v
	  // with input files keeps linking, and -version always stops.
	  if (Args.hasArg(OPT_v) && !Args.hasArg(OPT_INPUT))
	    return;
	  if (Args.hasArg(OPT_version))
	    return;

	  Config->ExitEarly = CanExitEarly && !Args.hasArg(OPT_full_shutdown);
	  errorHandler().ExitEarly = Config->ExitEarly;

	  if (const char *Path = getReproduceOption(Args)) {
	    // Note that --reproduce is a debug option so you can ignore it
	    // if you are trying to understand the whole picture of the code.
	    Expected<std::unique_ptr<TarWriter>> ErrOrWriter =
	        TarWriter::create(Path, path::stem(Path));
	    if (ErrOrWriter) {
	      Tar = ErrOrWriter->get();
	      Tar->append("response.txt", createResponseFile(Args));
	      Tar->append("version.txt", getLLDVersion() + "\n");
	      // Tar is a raw pointer; move the owning unique_ptr into an object
	      // created by make<>() so it is not destroyed when this scope ends.
	      make<std::unique_ptr<TarWriter>>(std::move(*ErrOrWriter));
	    } else {
	      error(Twine("--reproduce: failed to open ") + Path + ": " +
	            toString(ErrOrWriter.takeError()));
	    }
	  }

	  readConfigs(Args);
	  initLLVM(Args);
	  createFiles(Args);
	  inferMachineType();
	  setConfigs();
	  checkOptions(Args);
	  if (errorCount())
	    return;

	  switch (Config->EKind) {
	  case ELF32LEKind:
	    link<ELF32LE>(Args);
	    return;
	  case ELF32BEKind:
	    link<ELF32BE>(Args);
	    return;
	  case ELF64LEKind:
	    link<ELF64LE>(Args);
	    return;
	  case ELF64BEKind:
	    link<ELF64BE>(Args);
	    return;
	  default:
	    llvm_unreachable("unknown Config->EKind");
	  }
	}

	// Joins the values of all -rpath options into one string, separated by
	// colons.
	static std::string getRpath(opt::InputArgList &Args) {
	  std::vector<StringRef> Paths = args::getStrings(Args, OPT_rpath);
	  return llvm::join(Paths.begin(), Paths.end(), ":");
	}

	// Determines what we should do if there are remaining unresolved
	// symbols after the name resolution.
	//
	// -r always ignores them. Otherwise the last of -unresolved-symbols,
	// -no-undefined or -z defs on the command line decides; if none of them
	// is given, -shared ignores unresolved symbols and everything else
	// reports them as errors or warnings.
	static UnresolvedPolicy getUnresolvedSymbolPolicy(opt::InputArgList &Args) {
	  if (Args.hasArg(OPT_relocatable))
	    return UnresolvedPolicy::IgnoreAll;

	  // Whether a reported symbol is an error or just a warning.
	  UnresolvedPolicy ErrorOrWarn = Args.hasFlag(OPT_error_unresolved_symbols,
	                                              OPT_warn_unresolved_symbols, true)
	                                     ? UnresolvedPolicy::ReportError
	                                     : UnresolvedPolicy::Warn;

	  // Process the last of -unresolved-symbols, -no-undefined or -z defs.
	  // Walking the arguments in reverse makes the last one win.
	  for (auto *Arg : llvm::reverse(Args)) {
	    switch (Arg->getOption().getID()) {
	    case OPT_unresolved_symbols: {
	      StringRef S = Arg->getValue();
	      if (S == "ignore-all" || S == "ignore-in-object-files")
	        return UnresolvedPolicy::Ignore;
	      if (S == "ignore-in-shared-libs" || S == "report-all")
	        return ErrorOrWarn;
	      error("unknown --unresolved-symbols value: " + S);
	      continue;
	    }
	    case OPT_no_undefined:
	      return ErrorOrWarn;
	    case OPT_z:
	      if (StringRef(Arg->getValue()) == "defs")
	        return ErrorOrWarn;
	      continue;
	    }
	  }

	  // -shared implies -unresolved-symbols=ignore-all because missing
	  // symbols are likely to be resolved at runtime using other DSOs.
	  if (Config->Shared)
	    return UnresolvedPolicy::Ignore;
	  return ErrorOrWarn;
	}

	// Maps the --target2 value to a Target2Policy. The default, used both
	// when the option is absent and after reporting an unknown value, is
	// "got-rel".
	static Target2Policy getTarget2(opt::InputArgList &Args) {
	  StringRef Mode = Args.getLastArgValue(OPT_target2, "got-rel");
	  if (Mode == "got-rel")
	    return Target2Policy::GotRel;
	  if (Mode == "abs")
	    return Target2Policy::Abs;
	  if (Mode == "rel")
	    return Target2Policy::Rel;

	  error("unknown --target2 option: " + Mode);
	  return Target2Policy::GotRel;
	}

	// Returns true if the last --oformat option asks for "binary". Any
	// other value is reported as an error and treated as not binary.
	static bool isOutputFormatBinary(opt::InputArgList &Args) {
	  auto *FormatArg = Args.getLastArg(OPT_oformat);
	  if (!FormatArg)
	    return false;

	  StringRef Format = FormatArg->getValue();
	  if (Format == "binary")
	    return true;

	  error("unknown --oformat value: " + Format);
	  return false;
	}

	// Picks the DiscardPolicy from the last of -discard-all, -discard-locals
	// and -discard-none. -r always yields None; no option at all yields
	// Default.
	static DiscardPolicy getDiscard(opt::InputArgList &Args) {
	  if (Args.hasArg(OPT_relocatable))
	    return DiscardPolicy::None;

	  auto *Last =
	      Args.getLastArg(OPT_discard_all, OPT_discard_locals, OPT_discard_none);
	  if (!Last)
	    return DiscardPolicy::Default;

	  switch (Last->getOption().getID()) {
	  case OPT_discard_all:
	    return DiscardPolicy::All;
	  case OPT_discard_locals:
	    return DiscardPolicy::Locals;
	  default:
	    return DiscardPolicy::None;
	  }
	}

	// Returns the value of -dynamic-linker, or an empty string if the
	// option is absent or if -no-dynamic-linker is the last of the two
	// options on the command line.
	static StringRef getDynamicLinker(opt::InputArgList &Args) {
	  auto *Arg = Args.getLastArg(OPT_dynamic_linker, OPT_no_dynamic_linker);
	  if (!Arg || Arg->getOption().getID() == OPT_no_dynamic_linker)
	    return "";
	  return Arg->getValue();
	}

	// Picks the StripPolicy from the last of -strip-all and -strip-debug.
	// Nothing is stripped for -r or when neither option is given.
	static StripPolicy getStrip(opt::InputArgList &Args) {
	  if (Args.hasArg(OPT_relocatable))
	    return StripPolicy::None;

	  if (auto *Last = Args.getLastArg(OPT_strip_all, OPT_strip_debug))
	    return Last->getOption().getID() == OPT_strip_all ? StripPolicy::All
	                                                      : StripPolicy::Debug;
	  return StripPolicy::None;
	}

	// Parses S as a hexadecimal section address. An optional "0x" or "0X"
	// prefix is accepted and stripped before the digits are parsed. On a
	// malformed value an error naming Arg is reported and 0 is returned.
	static uint64_t parseSectionAddress(StringRef S, const opt::Arg &Arg) {
	  uint64_t VA = 0;
	  // to_integer() is called with an explicit radix, so the prefix has to
	  // be removed by hand; accept both spellings of it.
	  if (S.startswith("0x") || S.startswith("0X"))
	    S = S.drop_front(2);
	  if (!to_integer(S, VA, 16))
	    error("invalid argument: " + toString(Arg));
	  return VA;
	}

	// Builds the section-name -> start-address map from every
	// --section-start=<name>=<addr> option, then applies -Ttext, -Tdata and
	// -Tbss, which overwrite any earlier entry for .text, .data and .bss.
	static StringMap<uint64_t> getSectionStartMap(opt::InputArgList &Args) {
	  StringMap<uint64_t> Starts;

	  for (auto *Arg : Args.filtered(OPT_section_start)) {
	    std::pair<StringRef, StringRef> NameAndAddr =
	        StringRef(Arg->getValue()).split('=');
	    Starts[NameAndAddr.first] = parseSectionAddress(NameAndAddr.second, *Arg);
	  }

	  if (auto *Text = Args.getLastArg(OPT_Ttext))
	    Starts[".text"] = parseSectionAddress(Text->getValue(), *Text);
	  if (auto *Data = Args.getLastArg(OPT_Tdata))
	    Starts[".data"] = parseSectionAddress(Data->getValue(), *Data);
	  if (auto *Bss = Args.getLastArg(OPT_Tbss))
	    Starts[".bss"] = parseSectionAddress(Bss->getValue(), *Bss);
	  return Starts;
	}

	// Maps the --sort-section value to a SortSectionPolicy. A missing or
	// empty value means Default; an unknown value is reported as an error
	// and also falls back to Default.
	static SortSectionPolicy getSortSection(opt::InputArgList &Args) {
	  StringRef Rule = Args.getLastArgValue(OPT_sort_section);
	  if (Rule.empty())
	    return SortSectionPolicy::Default;
	  if (Rule == "name")
	    return SortSectionPolicy::Name;
	  if (Rule == "alignment")
	    return SortSectionPolicy::Alignment;

	  error("unknown --sort-section rule: " + Rule);
	  return SortSectionPolicy::Default;
	}

	// Maps the --orphan-handling value to an OrphanHandlingPolicy. "place"
	// is the default, used when the option is absent and after reporting an
	// unknown mode.
	static OrphanHandlingPolicy getOrphanHandling(opt::InputArgList &Args) {
	  StringRef Mode = Args.getLastArgValue(OPT_orphan_handling, "place");
	  if (Mode == "place")
	    return OrphanHandlingPolicy::Place;
	  if (Mode == "error")
	    return OrphanHandlingPolicy::Error;
	  if (Mode == "warn")
	    return OrphanHandlingPolicy::Warn;

	  error("unknown --orphan-handling mode: " + Mode);
	  return OrphanHandlingPolicy::Place;
	}

	// Parse --build-id or --build-id=<style>. We handle "tree" as a
	// synonym for "sha1" because all our hash functions including
	// -build-id=sha1 are actually tree hashes for performance reasons.
	//
	// Returns the build-id kind together with the user-supplied bytes; the
	// byte vector is only filled in for the "0x<hex>" style. An unknown
	// style is reported as an error and treated like "none".
	static std::pair<BuildIdKind, std::vector<uint8_t>>
	getBuildId(opt::InputArgList &Args) {
	  auto *Arg = Args.getLastArg(OPT_build_id, OPT_build_id_eq);
	  if (!Arg)
	    return {BuildIdKind::None, {}};

	  // Plain --build-id without a style.
	  if (Arg->getOption().getID() == OPT_build_id)
	    return {BuildIdKind::Fast, {}};

	  StringRef S = Arg->getValue();
	  if (S == "md5")
	    return {BuildIdKind::Md5, {}};
	  if (S == "sha1" || S == "tree")
	    return {BuildIdKind::Sha1, {}};
	  if (S == "uuid")
	    return {BuildIdKind::Uuid, {}};
	  if (S.startswith("0x"))
	    return {BuildIdKind::Hexstring, parseHex(S.substr(2))};

	  if (S != "none")
	    error("unknown --build-id style: " + S);
	  return {BuildIdKind::None, {}};
	}

	// Returns whether debug sections should be compressed, based on the
	// last --compress-debug-sections value ("none" when absent). Any value
	// other than "none" yields true; a value that is not "zlib", or a build
	// without zlib, is additionally reported as an error.
	static bool getCompressDebugSections(opt::InputArgList &Args) {
	  StringRef Mode = Args.getLastArgValue(OPT_compress_debug_sections, "none");
	  const bool Disabled = (Mode == "none");
	  if (Disabled)
	    return false;

	  if (Mode != "zlib")
	    error("unknown --compress-debug-sections value: " + Mode);
	  if (!zlib::isAvailable())
	    error("--compress-debug-sections: zlib is not available");
	  return true;
	}

	// Parses S as a decimal integer. If S is not a number, an error
	// mentioning the spelling of Arg is reported and 0 is returned.
	static int parseInt(StringRef S, opt::Arg *Arg) {
	  int Value = 0;
	  const bool Parsed = to_integer(S, Value, 10);
	  if (!Parsed)
	    error(Arg->getSpelling() + ": number expected, but got '" + S + "'");
	  return Value;
	}

	// Initializes Config members by the command line options.
	//
	// Most members map one-to-one onto an option and are assigned in the
	// long list below; options that need validation go through the small
	// get*() helpers, which report bad values via error(). Afterwards the
	// function handles the gold-compatible -plugin-opt values, -m,
	// -hash-style, and the options that read auxiliary files (symbol
	// ordering, retained symbols, dynamic lists and version scripts).
	void LinkerDriver::readConfigs(opt::InputArgList &Args) {
	  Config->AllowMultipleDefinition =
	      Args.hasArg(OPT_allow_multiple_definition) || hasZOption(Args, "muldefs");
	  Config->AuxiliaryList = args::getStrings(Args, OPT_auxiliary);
	  Config->Bsymbolic = Args.hasArg(OPT_Bsymbolic);
	  Config->BsymbolicFunctions = Args.hasArg(OPT_Bsymbolic_functions);
	  Config->Chroot = Args.getLastArgValue(OPT_chroot);
	  Config->CompressDebugSections = getCompressDebugSections(Args);
	  Config->DefineCommon = Args.hasFlag(OPT_define_common, OPT_no_define_common,
	                                      !Args.hasArg(OPT_relocatable));
	  Config->Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, true);
	  Config->DisableVerify = Args.hasArg(OPT_disable_verify);
	  Config->Discard = getDiscard(Args);
	  Config->DynamicLinker = getDynamicLinker(Args);
	  Config->EhFrameHdr =
	      Args.hasFlag(OPT_eh_frame_hdr, OPT_no_eh_frame_hdr, false);
	  Config->EmitRelocs = Args.hasArg(OPT_emit_relocs);
	  Config->EnableNewDtags = !Args.hasArg(OPT_disable_new_dtags);
	  Config->Entry = Args.getLastArgValue(OPT_entry);
	  Config->ExportDynamic =
	      Args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, false);
	  errorHandler().FatalWarnings =
	      Args.hasFlag(OPT_fatal_warnings, OPT_no_fatal_warnings, false);
	  Config->FilterList = args::getStrings(Args, OPT_filter);
	  Config->Fini = Args.getLastArgValue(OPT_fini, "_fini");
	  Config->FixCortexA53Errata843419 = Args.hasArg(OPT_fix_cortex_a53_843419);
	  Config->GcSections = Args.hasFlag(OPT_gc_sections, OPT_no_gc_sections, false);
	  Config->GdbIndex = Args.hasFlag(OPT_gdb_index, OPT_no_gdb_index, false);
	  Config->ICF = Args.hasFlag(OPT_icf_all, OPT_icf_none, false);
	  Config->ICFData = Args.hasArg(OPT_icf_data);
	  Config->Init = Args.getLastArgValue(OPT_init, "_init");
	  Config->LTOAAPipeline = Args.getLastArgValue(OPT_lto_aa_pipeline);
	  Config->LTONewPmPasses = Args.getLastArgValue(OPT_lto_newpm_passes);
	  Config->LTOO = args::getInteger(Args, OPT_lto_O, 2);
	  Config->LTOPartitions = args::getInteger(Args, OPT_lto_partitions, 1);
	  Config->MapFile = Args.getLastArgValue(OPT_Map);
	  Config->NoGnuUnique = Args.hasArg(OPT_no_gnu_unique);
	  Config->MergeArmExidx =
	      Args.hasFlag(OPT_merge_exidx_entries, OPT_no_merge_exidx_entries, true);
	  Config->NoUndefinedVersion = Args.hasArg(OPT_no_undefined_version);
	  Config->NoinhibitExec = Args.hasArg(OPT_noinhibit_exec);
	  Config->Nostdlib = Args.hasArg(OPT_nostdlib);
	  Config->OFormatBinary = isOutputFormatBinary(Args);
	  Config->Omagic = Args.hasFlag(OPT_omagic, OPT_no_omagic, false);
	  Config->OptRemarksFilename = Args.getLastArgValue(OPT_opt_remarks_filename);
	  Config->OptRemarksWithHotness = Args.hasArg(OPT_opt_remarks_with_hotness);
	  Config->Optimize = args::getInteger(Args, OPT_O, 1);
	  Config->OrphanHandling = getOrphanHandling(Args);
	  Config->OutputFile = Args.getLastArgValue(OPT_o);
	  Config->Pie = Args.hasFlag(OPT_pie, OPT_nopie, false);
	  Config->PrintGcSections =
	      Args.hasFlag(OPT_print_gc_sections, OPT_no_print_gc_sections, false);
	  Config->Rpath = getRpath(Args);
	  Config->Relocatable = Args.hasArg(OPT_relocatable);
	  Config->SaveTemps = Args.hasArg(OPT_save_temps);
	  Config->SearchPaths = args::getStrings(Args, OPT_library_path);
	  Config->SectionStartMap = getSectionStartMap(Args);
	  Config->Shared = Args.hasArg(OPT_shared);
	  Config->SingleRoRx = Args.hasArg(OPT_no_rosegment);
	  Config->SoName = Args.getLastArgValue(OPT_soname);
	  Config->SortSection = getSortSection(Args);
	  Config->Strip = getStrip(Args);
	  Config->Sysroot = Args.getLastArgValue(OPT_sysroot);
	  Config->Target1Rel = Args.hasFlag(OPT_target1_rel, OPT_target1_abs, false);
	  Config->Target2 = getTarget2(Args);
	  Config->ThinLTOCacheDir = Args.getLastArgValue(OPT_thinlto_cache_dir);
	  Config->ThinLTOCachePolicy = CHECK(
	      parseCachePruningPolicy(Args.getLastArgValue(OPT_thinlto_cache_policy)),
	      "--thinlto-cache-policy: invalid cache policy");
	  Config->ThinLTOJobs = args::getInteger(Args, OPT_thinlto_jobs, -1u);
	  ThreadsEnabled = Args.hasFlag(OPT_threads, OPT_no_threads, true);
	  Config->Trace = Args.hasArg(OPT_trace);
	  Config->Undefined = args::getStrings(Args, OPT_undefined);
	  // Reads Config->Shared, so it must stay after that assignment above.
	  Config->UnresolvedSymbols = getUnresolvedSymbolPolicy(Args);
	  Config->Verbose = Args.hasArg(OPT_verbose);
	  errorHandler().Verbose = Config->Verbose;
	  Config->WarnCommon = Args.hasArg(OPT_warn_common);
	  Config->ZCombreloc = !hasZOption(Args, "nocombreloc");
	  Config->ZExecstack = hasZOption(Args, "execstack");
	  Config->ZNocopyreloc = hasZOption(Args, "nocopyreloc");
	  Config->ZNodelete = hasZOption(Args, "nodelete");
	  Config->ZNodlopen = hasZOption(Args, "nodlopen");
	  Config->ZNow = hasZOption(Args, "now");
	  Config->ZOrigin = hasZOption(Args, "origin");
	  Config->ZRelro = !hasZOption(Args, "norelro");
	+ Config->ZRetpolineplt = hasZOption(Args, "retpolineplt");
	  Config->ZRodynamic = hasZOption(Args, "rodynamic");
	  Config->ZStackSize = args::getZOptionValue(Args, OPT_z, "stack-size", 0);
	  Config->ZText = !hasZOption(Args, "notext");
	  Config->ZWxneeded = hasZOption(Args, "wxneeded");

	  // Parse LTO plugin-related options for compatibility with gold.
	  // Recognized values update Config; a fixed set of other prefixes and
	  // names is silently accepted; anything else is an error.
	  for (auto *Arg : Args.filtered(OPT_plugin_opt, OPT_plugin_opt_eq)) {
	    StringRef S = Arg->getValue();
	    if (S == "disable-verify")
	      Config->DisableVerify = true;
	    else if (S == "save-temps")
	      Config->SaveTemps = true;
	    else if (S.startswith("O"))
	      Config->LTOO = parseInt(S.substr(1), Arg);
	    else if (S.startswith("lto-partitions="))
	      Config->LTOPartitions = parseInt(S.substr(15), Arg);
	    else if (S.startswith("jobs="))
	      Config->ThinLTOJobs = parseInt(S.substr(5), Arg);
	    else if (!S.startswith("/") && !S.startswith("-fresolution=") &&
	             !S.startswith("-pass-through=") && !S.startswith("mcpu=") &&
	             !S.startswith("thinlto") && S != "-function-sections" &&
	             S != "-data-sections")
	      error(Arg->getSpelling() + ": unknown option: " + S);
	  }

	  if (Config->LTOO > 3)
	    error("invalid optimization level for LTO: " + Twine(Config->LTOO));
	  if (Config->LTOPartitions == 0)
	    error("--lto-partitions: number of threads must be > 0");
	  if (Config->ThinLTOJobs == 0)
	    error("--thinlto-jobs: number of threads must be > 0");

	  // Parse ELF{32,64}{LE,BE} and CPU type.
	  if (auto *Arg = Args.getLastArg(OPT_m)) {
	    StringRef S = Arg->getValue();
	    std::tie(Config->EKind, Config->EMachine, Config->OSABI) =
	        parseEmulation(S);
	    Config->MipsN32Abi = (S == "elf32btsmipn32" || S == "elf32ltsmipn32");
	    Config->Emulation = S;
	  }

	  // Parse -hash-style={sysv,gnu,both}.
	  if (auto *Arg = Args.getLastArg(OPT_hash_style)) {
	    StringRef S = Arg->getValue();
	    if (S == "sysv")
	      Config->SysvHash = true;
	    else if (S == "gnu")
	      Config->GnuHash = true;
	    else if (S == "both")
	      Config->SysvHash = Config->GnuHash = true;
	    else
	      error("unknown -hash-style: " + S);
	  }

	  // -print-map overrides any -Map value with "-".
	  if (Args.hasArg(OPT_print_map))
	    Config->MapFile = "-";

	  // --omagic is an option to create old-fashioned executables in which
	  // .text segments are writable. Today, the option is still in use to
	  // create special-purpose programs such as boot loaders. It doesn't
	  // make sense to create PT_GNU_RELRO for such executables.
	  if (Config->Omagic)
	    Config->ZRelro = false;

	  std::tie(Config->BuildId, Config->BuildIdVector) = getBuildId(Args);

	  if (auto *Arg = Args.getLastArg(OPT_pack_dyn_relocs_eq)) {
	    StringRef S = Arg->getValue();
	    if (S == "android")
	      Config->AndroidPackDynRelocs = true;
	    else if (S != "none")
	      error("unknown -pack-dyn-relocs format: " + S);
	  }

	  if (auto *Arg = Args.getLastArg(OPT_symbol_ordering_file))
	    if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
	      Config->SymbolOrderingFile = args::getLines(*Buffer);

	  // If --retain-symbols-file is used, we'll keep only the symbols listed in
	  // the file and discard all others.
	  if (auto *Arg = Args.getLastArg(OPT_retain_symbols_file)) {
	    Config->DefaultSymbolVersion = VER_NDX_LOCAL;
	    if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
	      for (StringRef S : args::getLines(*Buffer))
	        Config->VersionScriptGlobals.push_back(
	            {S, /*IsExternCpp*/ false, /*HasWildcard*/ false});
	  }

	  bool HasExportDynamic =
	      Args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, false);

	  // Parses -dynamic-list and -export-dynamic-symbol. They make some
	  // symbols private. Note that -export-dynamic takes precedence over them
	  // as it says all symbols should be exported.
	  if (!HasExportDynamic) {
	    for (auto *Arg : Args.filtered(OPT_dynamic_list))
	      if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
	        readDynamicList(*Buffer);

	    for (auto *Arg : Args.filtered(OPT_export_dynamic_symbol))
	      Config->DynamicList.push_back(
	          {Arg->getValue(), /*IsExternCpp*/ false, /*HasWildcard*/ false});
	  }

	  for (auto *Arg : Args.filtered(OPT_version_script))
	    if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
	      readVersionScript(*Buffer);
	}

	// Some Config members do not directly correspond to any particular
	// command line option, but are computed from other Config values.
	// This function initializes such members. See Config.h for the details
	// of these values.
	static void setConfigs() {
	  ELFKind Kind = Config->EKind;
	  uint16_t Machine = Config->EMachine;

	  // There is an ILP32 ABI for x86-64, although it's not very popular.
	  // It is called the x32 ABI.
	  bool IsX32 = (Kind == ELF32LEKind && Machine == EM_X86_64);

	  Config->CopyRelocs = (Config->Relocatable || Config->EmitRelocs);
	  Config->Is64 = (Kind == ELF64LEKind || Kind == ELF64BEKind);
	  Config->IsLE = (Kind == ELF32LEKind || Kind == ELF64LEKind);
	  Config->Endianness =
	      Config->IsLE ? support::endianness::little : support::endianness::big;
	  Config->IsMips64EL = (Kind == ELF64LEKind && Machine == EM_MIPS);
	  // RELA is selected for every 64-bit target plus x32 and MIPS N32.
	  Config->IsRela = Config->Is64 || IsX32 || Config->MipsN32Abi;
	  Config->Pic = Config->Pie || Config->Shared;
	  Config->Wordsize = Config->Is64 ? 8 : 4;
	}

	// Interprets the value of a "-format" option. Returns true for
	// "binary" and false for "elf" or "default"; any other value is
	// reported as an error and treated as not binary.
	static bool getBinaryOption(StringRef S) {
	  if (S == "binary")
	    return true;
	  if (S == "elf" || S == "default")
	    return false;
	  error("unknown -format value: " + S +
	        " (supported formats: elf, default, binary)");
	  return false;
	}

	// Walks the command line in order and opens every input: libraries
	// (-l), plain input files and linker scripts. Position-dependent
	// options such as --as-needed, -Bstatic, --whole-archive, --start-lib
	// and -format are tracked along the way, so they affect only the inputs
	// that follow them. Reports "no input files" if nothing was added and no
	// other error has been reported.
	void LinkerDriver::createFiles(opt::InputArgList &Args) {
	  for (auto *Arg : Args) {
	    // Aliases are resolved first so each option is handled once.
	    switch (Arg->getOption().getUnaliasedOption().getID()) {
	    case OPT_library:
	      addLibrary(Arg->getValue());
	      break;
	    case OPT_INPUT:
	      addFile(Arg->getValue(), /*WithLOption=*/false);
	      break;
	    case OPT_script:
	      if (Optional<std::string> Path = searchLinkerScript(Arg->getValue())) {
	        if (Optional<MemoryBufferRef> MB = readFile(*Path))
	          readLinkerScript(*MB);
	        break;
	      }
	      error(Twine("cannot find linker script ") + Arg->getValue());
	      break;
	    case OPT_as_needed:
	      Config->AsNeeded = true;
	      break;
	    case OPT_format:
	      InBinary = getBinaryOption(Arg->getValue());
	      break;
	    case OPT_no_as_needed:
	      Config->AsNeeded = false;
	      break;
	    case OPT_Bstatic:
	      Config->Static = true;
	      break;
	    case OPT_Bdynamic:
	      Config->Static = false;
	      break;
	    case OPT_whole_archive:
	      InWholeArchive = true;
	      break;
	    case OPT_no_whole_archive:
	      InWholeArchive = false;
	      break;
	    case OPT_start_lib:
	      InLib = true;
	      break;
	    case OPT_end_lib:
	      InLib = false;
	      break;
	    }
	  }

	  if (Files.empty() && errorCount() == 0)
	    error("no input files");
	}

	// If -m <machine_type> was not given, infer it from object files.
	void LinkerDriver::inferMachineType() {
	if (Config->EKind != ELFNoneKind)
	return;

	for (InputFile *F : Files) {
	if (F->EKind == ELFNoneKind)
	continue;
	Config->EKind = F->EKind;
	Config->EMachine = F->EMachine;
	Config->OSABI = F->OSABI;
	Config->MipsN32Abi = Config->EMachine == EM_MIPS && isMipsN32Abi(F);
	return;
	}
	error("target emulation unknown: -m or at least one .o file required");
	}

	// Parse -z max-page-size=<value>. When the option is absent the
	// target's DefaultMaxPageSize is used. A value that is not a power of
	// two is reported as an error but still returned.
	static uint64_t getMaxPageSize(opt::InputArgList &Args) {
	  uint64_t PageSize = args::getZOptionValue(Args, OPT_z, "max-page-size",
	                                            Target->DefaultMaxPageSize);
	  if (isPowerOf2_64(PageSize))
	    return PageSize;

	  error("max-page-size: value isn't a power of 2");
	  return PageSize;
	}

	// Parses -image-base option. Returns None when the option is absent
	// and 0 after reporting a value that is not a number. A base address
	// that is not a multiple of the page size only produces a warning.
	static Optional<uint64_t> getImageBase(opt::InputArgList &Args) {
	  // Because we are using "Config->MaxPageSize" here, this function has to be
	  // called after the variable is initialized.
	  auto *BaseArg = Args.getLastArg(OPT_image_base);
	  if (!BaseArg)
	    return None;

	  StringRef Text = BaseArg->getValue();
	  uint64_t Base;
	  if (to_integer(Text, Base)) {
	    if ((Base % Config->MaxPageSize) != 0)
	      warn("-image-base: address isn't multiple of page size: " + Text);
	    return Base;
	  }

	  error("-image-base: number expected, but got " + Text);
	  return 0;
	}

	// Parses `--exclude-libs=lib,lib,...` and returns the set of names.
	// The library names may be delimited by commas or colons.
	static DenseSet<StringRef> getExcludeLibs(opt::InputArgList &Args) {
	  DenseSet<StringRef> Libs;
	  for (auto *Arg : Args.filtered(OPT_exclude_libs)) {
	    StringRef Rest = Arg->getValue();
	    // Peel off one name per delimiter; whatever remains after the last
	    // delimiter (possibly the whole value) is the final name.
	    size_t Sep = Rest.find_first_of(",:");
	    while (Sep != StringRef::npos) {
	      Libs.insert(Rest.substr(0, Sep));
	      Rest = Rest.substr(Sep + 1);
	      Sep = Rest.find_first_of(",:");
	    }
	    Libs.insert(Rest);
	  }
	  return Libs;
	}

	// Returns the name of the archive File belongs to: the file's own name
	// if it is an ArchiveFile, otherwise its ArchiveName when that is not
	// empty. Returns None for a file that has neither.
	static Optional<StringRef> getArchiveName(InputFile *File) {
	  Optional<StringRef> Name = None;
	  if (isa<ArchiveFile>(File))
	    Name = File->getName();
	  else if (!File->ArchiveName.empty())
	    Name = StringRef(File->ArchiveName);
	  return Name;
	}

	// Handles the -exclude-libs option. If a static library file is specified
	// by the -exclude-libs option, all public symbols from the archive become
	// private unless otherwise specified by version scripts or something.
	// A special library name "ALL" means all archive files.
	//
	// This is not a popular option, but some programs such as bionic libc use it.
	template <class ELFT>
	static void excludeLibs(opt::InputArgList &Args, ArrayRef<InputFile *> Files) {
	  DenseSet<StringRef> Libs = getExcludeLibs(Args);
	  bool All = Libs.count("ALL");

	  // Archives are matched by file name only, without the directory part.
	  for (InputFile *File : Files)
	    if (Optional<StringRef> Archive = getArchiveName(File))
	      if (All || Libs.count(path::filename(*Archive)))
	        for (Symbol *Sym : File->getSymbols())
	          if (!Sym->isLocal())
	            Sym->VersionId = VER_NDX_LOCAL;
	}

	// Do actual linking. Note that when this function is called,
	// all linker scripts have already been parsed.
	//
	// The steps are, in order: pick the target and finish the settings that
	// depend on it, check that the output can be written, build the symbol
	// table from all input files, apply -exclude-libs, version scripts, LTO
	// and -wrap, gather the input sections, run the size optimizations and
	// write the result. The function returns early whenever a step has
	// reported errors.
	template <class ELFT> void LinkerDriver::link(opt::InputArgList &Args) {
	  Target = getTarget();

	  // getImageBase() reads Config->MaxPageSize, so the order matters.
	  Config->MaxPageSize = getMaxPageSize(Args);
	  Config->ImageBase = getImageBase(Args);

	  // If a -hash-style option was not given, set to a default value,
	  // which varies depending on the target.
	  if (!Args.hasArg(OPT_hash_style)) {
	    if (Config->EMachine == EM_MIPS)
	      Config->SysvHash = true;
	    else
	      Config->SysvHash = Config->GnuHash = true;
	  }

	  // Default output filename is "a.out" by the Unix tradition.
	  if (Config->OutputFile.empty())
	    Config->OutputFile = "a.out";

	  // Fail early if the output file or map file is not writable. If a user has a
	  // long link, e.g. due to a large LTO link, they do not wish to run it and
	  // find that it failed because there was a mistake in their command-line.
	  if (auto E = tryCreateFile(Config->OutputFile))
	    error("cannot open output file " + Config->OutputFile + ": " + E.message());
	  if (auto E = tryCreateFile(Config->MapFile))
	    error("cannot open map file " + Config->MapFile + ": " + E.message());
	  if (errorCount())
	    return;

	  // Use default entry point name if no name was given via the command
	  // line nor linker scripts. For some reason, MIPS entry point name is
	  // different from others.
	  Config->WarnMissingEntry =
	      (!Config->Entry.empty() || (!Config->Shared && !Config->Relocatable));
	  if (Config->Entry.empty() && !Config->Relocatable)
	    Config->Entry = (Config->EMachine == EM_MIPS) ? "__start" : "_start";

	  // Handle --trace-symbol.
	  for (auto *Arg : Args.filtered(OPT_trace_symbol))
	    Symtab->trace(Arg->getValue());

	  // Add all files to the symbol table. This will add almost all
	  // symbols that we need to the symbol table.
	  for (InputFile *F : Files)
	    Symtab->addFile<ELFT>(F);

	  // Process -defsym option.
	  for (auto *Arg : Args.filtered(OPT_defsym)) {
	    StringRef From;
	    StringRef To;
	    std::tie(From, To) = StringRef(Arg->getValue()).split('=');
	    readDefsym(From, MemoryBufferRef(To, "-defsym"));
	  }

	  // Now that we have every file, we can decide if we will need a
	  // dynamic symbol table.
	  // We need one if we were asked to export dynamic symbols or if we are
	  // producing a shared library.
	  // We also need one if any shared libraries are used and for pie executables
	  // (probably because the dynamic linker needs it).
	  Config->HasDynSymTab =
	      !SharedFiles.empty() || Config->Pic || Config->ExportDynamic;

	  // Some symbols (such as __ehdr_start) are defined lazily only when there
	  // are undefined symbols for them, so we add these to trigger that logic.
	  for (StringRef Sym : Script->ReferencedSymbols)
	    Symtab->addUndefined<ELFT>(Sym);

	  // Handle the `--undefined <sym>` options.
	  for (StringRef S : Config->Undefined)
	    Symtab->fetchIfLazy<ELFT>(S);

	  // If an entry symbol is in a static archive, pull out that file now
	  // to complete the symbol table. After this, no new names except a
	  // few linker-synthesized ones will be added to the symbol table.
	  Symtab->fetchIfLazy<ELFT>(Config->Entry);

	  // Return if there were name resolution errors.
	  if (errorCount())
	    return;

	  // Handle undefined symbols in DSOs.
	  if (!Config->Shared)
	    Symtab->scanShlibUndefined<ELFT>();

	  // Handle the -exclude-libs option.
	  if (Args.hasArg(OPT_exclude_libs))
	    excludeLibs<ELFT>(Args, Files);

	  // Create ElfHeader early. We need a dummy section in
	  // addReservedSymbols to mark the created symbols as not absolute.
	  Out::ElfHeader = make<OutputSection>("", 0, SHF_ALLOC);
	  Out::ElfHeader->Size = sizeof(typename ELFT::Ehdr);

	  // We need to create some reserved symbols such as _end. Create them.
	  if (!Config->Relocatable)
	    addReservedSymbols();

	  // Apply version scripts.
	  Symtab->scanVersionScript();

	  // Create wrapped symbols for -wrap option.
	  for (auto *Arg : Args.filtered(OPT_wrap))
	    Symtab->addSymbolWrap<ELFT>(Arg->getValue());

	  Symtab->addCombinedLTOObject<ELFT>();
	  if (errorCount())
	    return;

	  // Apply symbol renames for -wrap.
	  Symtab->applySymbolWrap();

	  // Now that we have a complete list of input files.
	  // Beyond this point, no new files are added.
	  // Aggregate all input sections into one place.
	  for (InputFile *F : ObjectFiles)
	    for (InputSectionBase *S : F->getSections())
	      if (S && S != &InputSection::Discarded)
	        InputSections.push_back(S);
	  for (BinaryFile *F : BinaryFiles)
	    for (InputSectionBase *S : F->getSections())
	      InputSections.push_back(cast<InputSection>(S));

	  // We do not want to emit debug sections if --strip-all
	  // or -strip-debug are given.
	  if (Config->Strip != StripPolicy::None)
	    llvm::erase_if(InputSections, [](InputSectionBase *S) {
	      return S->Name.startswith(".debug") || S->Name.startswith(".zdebug");
	    });

	  Config->EFlags = Target->calcEFlags();

	  if (Config->EMachine == EM_ARM) {
	    // FIXME: These warnings can be removed when lld only uses these features
	    // when the input objects have been compiled with an architecture that
	    // supports them.
	    if (Config->ARMHasBlx == false)
	      warn("lld uses blx instruction, no object with architecture supporting "
	           "feature detected.");
	    if (Config->ARMJ1J2BranchEncoding == false)
	      warn("lld uses extended branch encoding, no object with architecture "
	           "supporting feature detected.");
	    if (Config->ARMHasMovtMovw == false)
	      warn("lld may use movt/movw, no object with architecture supporting "
	           "feature detected.");
	  }

	  // This adds a .comment section containing a version string. We have to add it
	  // before the decompressSections() and mergeSections() calls below because
	  // the .comment section is a mergeable section.
	  if (!Config->Relocatable)
	    InputSections.push_back(createCommentSection());

	  // Do size optimizations: garbage collection, merging of SHF_MERGE sections
	  // and identical code folding.
	  markLive<ELFT>();
	  decompressSections();
	  mergeSections();
	  if (Config->ICF)
	    doIcf<ELFT>();

	  // Write the result to the file.
	  writeResult<ELFT>();
	}
	Index: head/contrib/llvm/tools/lld
	===================================================================
	--- head/contrib/llvm/tools/lld (revision 328816)
	+++ head/contrib/llvm/tools/lld (revision 328817)

	Property changes on: head/contrib/llvm/tools/lld
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lld/dist-release_60:r328750-328794
	Index: head/contrib/llvm/tools/lldb
	===================================================================
	--- head/contrib/llvm/tools/lldb (revision 328816)
	+++ head/contrib/llvm/tools/lldb (revision 328817)

	Property changes on: head/contrib/llvm/tools/lldb
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lldb/dist-release_60:r328751-328794
	Index: head/contrib/llvm/tools/opt/opt.cpp
	===================================================================
	--- head/contrib/llvm/tools/opt/opt.cpp (revision 328816)
	+++ head/contrib/llvm/tools/opt/opt.cpp (revision 328817)
	@@ -1,793 +1,794 @@
	//===- opt.cpp - The LLVM Modular Optimizer -------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Optimizations may be specified an arbitrary number of times on the command
	// line. They are run in the order specified.
	//
	//===----------------------------------------------------------------------===//

	#include "BreakpointPrinter.h"
	#include "NewPMDriver.h"
	#include "PassPrinters.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Analysis/CallGraph.h"
	#include "llvm/Analysis/CallGraphSCCPass.h"
	#include "llvm/Analysis/LoopPass.h"
	#include "llvm/Analysis/RegionPass.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Bitcode/BitcodeWriterPass.h"
	#include "llvm/CodeGen/CommandFlags.def"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/IRPrintingPasses.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/LegacyPassManager.h"
	#include "llvm/IR/LegacyPassNameParser.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Verifier.h"
	#include "llvm/IRReader/IRReader.h"
	#include "llvm/InitializePasses.h"
	#include "llvm/LinkAllIR.h"
	#include "llvm/LinkAllPasses.h"
	#include "llvm/MC/SubtargetFeature.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/FileSystem.h"
	#include "llvm/Support/Host.h"
	#include "llvm/Support/ManagedStatic.h"
	#include "llvm/Support/PluginLoader.h"
	#include "llvm/Support/PrettyStackTrace.h"
	#include "llvm/Support/Signals.h"
	#include "llvm/Support/SourceMgr.h"
	#include "llvm/Support/SystemUtils.h"
	#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Support/TargetSelect.h"
	#include "llvm/Support/ToolOutputFile.h"
	#include "llvm/Support/YAMLTraits.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Transforms/Coroutines.h"
	#include "llvm/Transforms/IPO/AlwaysInliner.h"
	#include "llvm/Transforms/IPO/PassManagerBuilder.h"
	#include "llvm/Transforms/Utils/Cloning.h"
	#include <algorithm>
	#include <memory>
	using namespace llvm;
	using namespace opt_tool;

	// Legacy-pass-manager passes requested on the command line. The
	// PassNameParser populates this list automatically from the registered
	// passes.
	static cl::list<const PassInfo *, bool, PassNameParser> PassList(
	    cl::desc("Optimizations available:"));

	// -passes=<pipeline>: textual description of the pass pipeline to run over
	// the module. When this option occurs, main() hands the module to
	// runPassPipeline() (new pass manager) and none of the legacy pass-manager
	// set-up below it is executed.
	static cl::opt<std::string>
	    PassPipeline("passes",
	                 cl::desc("A textual description of the pass pipeline for optimizing"),
	                 cl::Hidden);

	// Input / output selection.
	//
	// Positional input file; "-" is the default value.
	static cl::opt<std::string> InputFilename(
	    cl::Positional,
	    cl::desc("<input bitcode file>"),
	    cl::init("-"),
	    cl::value_desc("filename"));

	// -o <filename>; main() substitutes "-" when it is left empty.
	static cl::opt<std::string> OutputFilename(
	    "o",
	    cl::desc("Override output filename"),
	    cl::value_desc("filename"));

	// -f: skips the bitcode-to-console check in main().
	static cl::opt<bool> Force(
	    "f",
	    cl::desc("Enable binary output on terminals"));

	// -p: main() queues a module printer to errs() after each listed pass.
	static cl::opt<bool> PrintEachXForm(
	    "p",
	    cl::desc("Print module after each transformation"));

	// -disable-output: no output file is opened and no writer pass is added.
	static cl::opt<bool> NoOutput(
	    "disable-output",
	    cl::desc("Do not write result bitcode file"),
	    cl::Hidden);

	// -S: emit textual IR through a print-module pass instead of bitcode.
	static cl::opt<bool> OutputAssembly(
	    "S",
	    cl::desc("Write output as LLVM assembly"));

	// -thinlto-bc: use the ThinLTO bitcode writer pass for the output.
	static cl::opt<bool> OutputThinLTOBC(
	    "thinlto-bc",
	    cl::desc("Write output as ThinLTO-ready bitcode"));

	// -thin-link-bitcode-file <filename>: optional second output stream that is
	// passed to the ThinLTO bitcode writer.
	static cl::opt<std::string> ThinLinkBitcodeFile(
	    "thin-link-bitcode-file",
	    cl::value_desc("filename"),
	    cl::desc(
	        "A file in which to write minimized bitcode for the thin link only"));

	// -disable-verify: main() skips verifying the input module and does not
	// append the final verifier pass.
	static cl::opt<bool> NoVerify(
	    "disable-verify",
	    cl::desc("Do not run the verifier"),
	    cl::Hidden);

	// -verify-each: addPass() queues a verifier pass after every pass it adds.
	static cl::opt<bool> VerifyEach(
	    "verify-each",
	    cl::desc("Verify after each transform"));

	// -disable-debug-info-type-map: main() leaves debug-type ODR uniquing off in
	// the LLVMContext when this is set.
	static cl::opt<bool> DisableDITypeMap(
	    "disable-debug-info-type-map",
	    cl::desc("Don't use a uniquing type map for debug info"));

	// -strip-debug: main() calls StripDebugInfo() on the module before verifying
	// it.
	static cl::opt<bool> StripDebug(
	    "strip-debug",
	    cl::desc("Strip debugger symbol info from translation unit"));

	// -disable-inlining: the pipeline builders in this file leave
	// PassManagerBuilder::Inliner unset.
	static cl::opt<bool> DisableInline(
	    "disable-inlining",
	    cl::desc("Do not run the inliner pass"));

	// -disable-opt: AddStandardLinkPasses() forces the builder's OptLevel to 0.
	static cl::opt<bool> DisableOptimizations(
	    "disable-opt",
	    cl::desc("Do not run any optimization passes"));

	// -std-link-opts: main() inserts the LTO pipeline at the position this flag
	// had on the command line.
	static cl::opt<bool> StandardLinkOpts(
	    "std-link-opts",
	    cl::desc("Include the standard link time optimizations"));

	// Optimization-level flags. main() maps each one to an (OptLevel, SizeLevel)
	// pair when calling AddOptimizationPasses():
	//   -O0 -> (0, 0)   -O1 -> (1, 0)   -O2 -> (2, 0)
	//   -Os -> (2, 1)   -Oz -> (2, 2)   -O3 -> (3, 0)
	static cl::opt<bool> OptLevelO0(
	    "O0",
	    cl::desc("Optimization level 0. Similar to clang -O0"));

	static cl::opt<bool> OptLevelO1(
	    "O1",
	    cl::desc("Optimization level 1. Similar to clang -O1"));

	static cl::opt<bool> OptLevelO2(
	    "O2",
	    cl::desc("Optimization level 2. Similar to clang -O2"));

	static cl::opt<bool> OptLevelOs(
	    "Os",
	    cl::desc("Like -O2 with extra optimizations for size. Similar to clang -Os"));

	static cl::opt<bool> OptLevelOz(
	    "Oz",
	    cl::desc("Like -Os but reduces code size further. Similar to clang -Oz"));

	static cl::opt<bool> OptLevelO3(
	    "O3",
	    cl::desc("Optimization level 3. Similar to clang -O3"));

	// -codegen-opt-level=<n>: when given, GetCodeGenOptLevel() returns this value
	// instead of deriving one from the -O flags.
	static cl::opt<unsigned> CodeGenOptLevel(
	    "codegen-opt-level",
	    cl::desc("Override optimization level for codegen hooks"));

	// -mtriple=<triple>: main() normalizes the value and installs it on the
	// module when it is non-empty.
	static cl::opt<std::string> TargetTriple(
	    "mtriple",
	    cl::desc("Override target triple for module"));

	// -funit-at-a-time (default true): its negation is copied into
	// PassManagerBuilder::DisableUnitAtATime.
	static cl::opt<bool> UnitAtATime(
	    "funit-at-a-time",
	    cl::desc("Enable IPO. This corresponds to gcc's -funit-at-a-time"),
	    cl::init(true));

	// -disable-loop-unrolling: only consulted when it actually occurred on the
	// command line; otherwise unrolling is disabled exactly at OptLevel 0.
	static cl::opt<bool> DisableLoopUnrolling(
	    "disable-loop-unrolling",
	    cl::desc("Disable loop unrolling in all relevant passes"),
	    cl::init(false));

	// -disable-loop-vectorization: forces PassManagerBuilder::LoopVectorize off.
	static cl::opt<bool> DisableLoopVectorization(
	    "disable-loop-vectorization",
	    cl::desc("Disable the loop vectorization pass"),
	    cl::init(false));

	// -disable-slp-vectorization: forces PassManagerBuilder::SLPVectorize off.
	static cl::opt<bool> DisableSLPVectorization(
	    "disable-slp-vectorization",
	    cl::desc("Disable the slp vectorization pass"),
	    cl::init(false));

	// -module-summary: forwarded to the bitcode writer; rejected together with
	// -S in main().
	static cl::opt<bool> EmitSummaryIndex(
	    "module-summary",
	    cl::desc("Emit module summary index"),
	    cl::init(false));

	// -module-hash: forwarded to the bitcode writer; rejected together with -S
	// in main().
	static cl::opt<bool> EmitModuleHash(
	    "module-hash",
	    cl::desc("Emit module hash"),
	    cl::init(false));

	// -disable-simplify-libcalls: main() calls disableAllFunctions() on the
	// TargetLibraryInfoImpl when this is set.
	static cl::opt<bool> DisableSimplifyLibCalls(
	    "disable-simplify-libcalls",
	    cl::desc("Disable simplify-libcalls"));

	// -q / -quiet: described as obsolete, but main() still forwards the value to
	// the pass printers and to the bitcode-to-console check.
	static cl::opt<bool> Quiet(
	    "q",
	    cl::desc("Obsolete option"),
	    cl::Hidden);

	static cl::alias QuietA(
	    "quiet",
	    cl::desc("Alias for -q"),
	    cl::aliasopt(Quiet));

	// -analyze: main() attaches a pass printer after each listed pass and adds no
	// output-writer pass.
	static cl::opt<bool> AnalyzeOnly(
	    "analyze",
	    cl::desc("Only perform analysis, no optimization"));

	// -print-breakpoints-for-testing: main() adds the breakpoint printer and
	// turns NoOutput on.
	static cl::opt<bool> PrintBreakpoints(
	    "print-breakpoints-for-testing",
	    cl::desc("Print select breakpoints location for testing"));

	// -data-layout=<layout-string>: installed on the module when non-empty.
	static cl::opt<std::string> ClDataLayout(
	    "data-layout",
	    cl::desc("data layout string to use"),
	    cl::value_desc("layout-string"),
	    cl::init(""));

	// Use-list order preservation, forwarded to the bitcode writer (default
	// true) and to the textual IR printer (default false) respectively.
	static cl::opt<bool> PreserveBitcodeUseListOrder(
	    "preserve-bc-uselistorder",
	    cl::desc("Preserve use-list order when writing LLVM bitcode."),
	    cl::init(true),
	    cl::Hidden);

	static cl::opt<bool> PreserveAssemblyUseListOrder(
	    "preserve-ll-uselistorder",
	    cl::desc("Preserve use-list order when writing LLVM assembly."),
	    cl::init(false),
	    cl::Hidden);

	// -run-twice: main() runs the pass manager on a clone of the module first and
	// compares the buffered output of both runs.
	static cl::opt<bool> RunTwice(
	    "run-twice",
	    cl::desc("Run all passes twice, re-using the same pass manager."),
	    cl::init(false),
	    cl::Hidden);

	// -discard-value-names: forwarded to LLVMContext::setDiscardValueNames().
	static cl::opt<bool> DiscardValueNames(
	    "discard-value-names",
	    cl::desc("Discard names from Value (other than GlobalValue)."),
	    cl::init(false),
	    cl::Hidden);

	// -enable-coroutines: AddOptimizationPasses() registers the coroutine passes
	// at the builder's extension points.
	static cl::opt<bool> Coroutines(
	    "enable-coroutines",
	    cl::desc("Enable coroutine passes."),
	    cl::init(false),
	    cl::Hidden);

	// Optimization-remark controls, applied to the LLVMContext in main().
	static cl::opt<bool> PassRemarksWithHotness(
	    "pass-remarks-with-hotness",
	    cl::desc("With PGO, include profile count in optimization remarks"),
	    cl::Hidden);

	static cl::opt<unsigned> PassRemarksHotnessThreshold(
	    "pass-remarks-hotness-threshold",
	    cl::desc("Minimum profile count required for an optimization remark to be output"),
	    cl::Hidden);

	// -pass-remarks-output=<filename>: when non-empty, main() opens the file and
	// installs a YAML output stream on the context.
	static cl::opt<std::string> RemarksFilename(
	    "pass-remarks-output",
	    cl::desc("YAML output filename for pass remarks"),
	    cl::value_desc("filename"));

	/// Queue \p P on \p PM. When -verify-each is set, a verifier pass is queued
	/// immediately after it.
	static inline void addPass(legacy::PassManagerBase &PM, Pass *P) {
	  PM.add(P);

	  // Nothing more to do unless every intermediate step is being verified.
	  if (!VerifyEach)
	    return;

	  PM.add(createVerifierPass());
	}

	/// This routine adds optimization passes based on the selected optimization
	/// level.
	///
	/// A PassManagerBuilder is configured from \p OptLevel, \p SizeLevel and the
	/// command-line switches declared in this file (-disable-inlining,
	/// -funit-at-a-time, -disable-loop-unrolling, -disable-loop-vectorization,
	/// -disable-slp-vectorization, -enable-coroutines) and is then used to
	/// populate both pass managers.
	///
	/// \param MPM       Module pass manager that receives the module pipeline.
	/// \param FPM       Function pass manager that receives the function
	///                  pipeline; a verifier pass is added to it first unless
	///                  -disable-verify is given without -verify-each.
	/// \param TM        Target machine, may be null; when present it is given the
	///                  chance to adjust the builder.
	/// \param OptLevel  Optimization level (callers in this file pass 0 to 3).
	/// \param SizeLevel Size-optimization level (callers in this file pass 0, 1
	///                  for -Os and 2 for -Oz).
	static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
	                                  legacy::FunctionPassManager &FPM,
	                                  TargetMachine *TM, unsigned OptLevel,
	                                  unsigned SizeLevel) {
	  if (!NoVerify || VerifyEach)
	    FPM.add(createVerifierPass()); // Verify that input is correct

	  PassManagerBuilder Builder;
	  Builder.OptLevel = OptLevel;
	  Builder.SizeLevel = SizeLevel;

	  if (DisableInline) {
	    // No inlining pass
	  } else if (OptLevel > 1) {
	    Builder.Inliner = createFunctionInliningPass(OptLevel, SizeLevel, false);
	  } else {
	    // At -O0 / -O1 only the always-inliner is used.
	    Builder.Inliner = createAlwaysInlinerLegacyPass();
	  }
	  Builder.DisableUnitAtATime = !UnitAtATime;

	  // An explicit -disable-loop-unrolling=<bool> wins; otherwise unrolling is
	  // disabled only at optimization level 0.
	  Builder.DisableUnrollLoops = (DisableLoopUnrolling.getNumOccurrences() > 0) ?
	                               DisableLoopUnrolling : OptLevel == 0;

	  // This is final, unless there is a #pragma vectorize enable
	  if (DisableLoopVectorization)
	    Builder.LoopVectorize = false;
	  // If option wasn't forced via cmd line (-vectorize-loops, -loop-vectorize)
	  else if (!Builder.LoopVectorize)
	    Builder.LoopVectorize = OptLevel > 1 && SizeLevel < 2;

	  // When #pragma vectorize is on for SLP, do the same as above
	  Builder.SLPVectorize =
	      DisableSLPVectorization ? false : OptLevel > 1 && SizeLevel < 2;

	  if (TM)
	    TM->adjustPassManager(Builder);

	  if (Coroutines)
	    addCoroutinePassesToExtensionPoints(Builder);

	  Builder.populateFunctionPassManager(FPM);
	  Builder.populateModulePassManager(MPM);
	}

	/// Populate \p PM with the LTO pipeline produced by PassManagerBuilder.
	/// Input verification is always requested; -disable-inlining and
	/// -disable-opt are honoured.
	static void AddStandardLinkPasses(legacy::PassManagerBase &PM) {
	  PassManagerBuilder LTOBuilder;
	  LTOBuilder.VerifyInput = true;

	  // Leave the inliner slot empty when inlining was disabled.
	  if (!DisableInline)
	    LTOBuilder.Inliner = createFunctionInliningPass();

	  if (DisableOptimizations)
	    LTOBuilder.OptLevel = 0;

	  LTOBuilder.populateLTOPassManager(PM);
	}

	//===----------------------------------------------------------------------===//
	// CodeGen-related helper functions.
	//

	// Chooses the code generation optimization level: an explicit
	// -codegen-opt-level value is used as-is; otherwise -O1, -O2 and -O3 (checked
	// in that order) select Less, Default and Aggressive, and anything else
	// yields None.
	static CodeGenOpt::Level GetCodeGenOptLevel() {
	  if (CodeGenOptLevel.getNumOccurrences()) {
	    const unsigned Requested = CodeGenOptLevel;
	    return static_cast<CodeGenOpt::Level>(Requested);
	  }

	  CodeGenOpt::Level Level = CodeGenOpt::None;
	  if (OptLevelO1)
	    Level = CodeGenOpt::Less;
	  else if (OptLevelO2)
	    Level = CodeGenOpt::Default;
	  else if (OptLevelO3)
	    Level = CodeGenOpt::Aggressive;
	  return Level;
	}

	// Looks up the target for TheTriple (together with the MArch option) and
	// creates a TargetMachine for it. Returns nullptr when the lookup finds no
	// target; some modules don't specify a triple, and this is okay.
	static TargetMachine *GetTargetMachine(Triple TheTriple, StringRef CPUStr,
	                                       StringRef FeaturesStr,
	                                       const TargetOptions &Options) {
	  std::string LookupError; // Required by lookupTarget(); never reported.
	  if (const Target *Found =
	          TargetRegistry::lookupTarget(MArch, TheTriple, LookupError))
	    return Found->createTargetMachine(TheTriple.getTriple(), CPUStr,
	                                      FeaturesStr, Options, getRelocModel(),
	                                      getCodeModel(), GetCodeGenOptLevel());

	  return nullptr;
	}

	#ifdef LINK_POLLY_INTO_TOOLS
	// Forward declaration only; main() calls this with the global PassRegistry
	// under the same LINK_POLLY_INTO_TOOLS guard.
	namespace polly {
	void initializePollyPasses(llvm::PassRegistry &Registry);
	}
	#endif

	//===----------------------------------------------------------------------===//
	// main for opt
	//
	int main(int argc, char **argv) {
	sys::PrintStackTraceOnErrorSignal(argv[0]);
	llvm::PrettyStackTraceProgram X(argc, argv);

	// Enable debug stream buffering.
	EnableDebugBuffering = true;

	llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
	LLVMContext Context;

	InitializeAllTargets();
	InitializeAllTargetMCs();
	InitializeAllAsmPrinters();
	InitializeAllAsmParsers();

	// Initialize passes
	PassRegistry &Registry = *PassRegistry::getPassRegistry();
	initializeCore(Registry);
	initializeCoroutines(Registry);
	initializeScalarOpts(Registry);
	initializeObjCARCOpts(Registry);
	initializeVectorization(Registry);
	initializeIPO(Registry);
	initializeAnalysis(Registry);
	initializeTransformUtils(Registry);
	initializeInstCombine(Registry);
	initializeInstrumentation(Registry);
	initializeTarget(Registry);
	// For codegen passes, only passes that do IR to IR transformation are
	// supported.
	initializeExpandMemCmpPassPass(Registry);
	initializeScalarizeMaskedMemIntrinPass(Registry);
	initializeCodeGenPreparePass(Registry);
	initializeAtomicExpandPass(Registry);
	initializeRewriteSymbolsLegacyPassPass(Registry);
	initializeWinEHPreparePass(Registry);
	initializeDwarfEHPreparePass(Registry);
	initializeSafeStackLegacyPassPass(Registry);
	initializeSjLjEHPreparePass(Registry);
	initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
	initializeGlobalMergePass(Registry);
	+ initializeIndirectBrExpandPassPass(Registry);
	initializeInterleavedAccessPass(Registry);
	initializeEntryExitInstrumenterPass(Registry);
	initializePostInlineEntryExitInstrumenterPass(Registry);
	initializeUnreachableBlockElimLegacyPassPass(Registry);
	initializeExpandReductionsPass(Registry);
	initializeWriteBitcodePassPass(Registry);

	#ifdef LINK_POLLY_INTO_TOOLS
	polly::initializePollyPasses(Registry);
	#endif

	cl::ParseCommandLineOptions(argc, argv,
	"llvm .bc -> .bc modular optimizer and analysis printer\n");

	if (AnalyzeOnly && NoOutput) {
	errs() << argv[0] << ": analyze mode conflicts with no-output mode.\n";
	return 1;
	}

	SMDiagnostic Err;

	Context.setDiscardValueNames(DiscardValueNames);
	if (!DisableDITypeMap)
	Context.enableDebugTypeODRUniquing();

	if (PassRemarksWithHotness)
	Context.setDiagnosticsHotnessRequested(true);

	if (PassRemarksHotnessThreshold)
	Context.setDiagnosticsHotnessThreshold(PassRemarksHotnessThreshold);

	std::unique_ptr<ToolOutputFile> OptRemarkFile;
	if (RemarksFilename != "") {
	std::error_code EC;
	OptRemarkFile =
	llvm::make_unique<ToolOutputFile>(RemarksFilename, EC, sys::fs::F_None);
	if (EC) {
	errs() << EC.message() << '\n';
	return 1;
	}
	Context.setDiagnosticsOutputFile(
	llvm::make_unique<yaml::Output>(OptRemarkFile->os()));
	}

	// Load the input module...
	std::unique_ptr<Module> M =
	parseIRFile(InputFilename, Err, Context, !NoVerify);

	if (!M) {
	Err.print(argv[0], errs());
	return 1;
	}

	// Strip debug info before running the verifier.
	if (StripDebug)
	StripDebugInfo(*M);

	// Immediately run the verifier to catch any problems before starting up the
	// pass pipelines. Otherwise we can crash on broken code during
	// doInitialization().
	if (!NoVerify && verifyModule(*M, &errs())) {
	errs() << argv[0] << ": " << InputFilename
	<< ": error: input module is broken!\n";
	return 1;
	}

	// If we are supposed to override the target triple or data layout, do so now.
	if (!TargetTriple.empty())
	M->setTargetTriple(Triple::normalize(TargetTriple));
	if (!ClDataLayout.empty())
	M->setDataLayout(ClDataLayout);

	// Figure out what stream we are supposed to write to...
	std::unique_ptr<ToolOutputFile> Out;
	std::unique_ptr<ToolOutputFile> ThinLinkOut;
	if (NoOutput) {
	if (!OutputFilename.empty())
	errs() << "WARNING: The -o (output filename) option is ignored when\n"
	"the --disable-output option is used.\n";
	} else {
	// Default to standard output.
	if (OutputFilename.empty())
	OutputFilename = "-";

	std::error_code EC;
	Out.reset(new ToolOutputFile(OutputFilename, EC, sys::fs::F_None));
	if (EC) {
	errs() << EC.message() << '\n';
	return 1;
	}

	if (!ThinLinkBitcodeFile.empty()) {
	ThinLinkOut.reset(
	new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::F_None));
	if (EC) {
	errs() << EC.message() << '\n';
	return 1;
	}
	}
	}

	Triple ModuleTriple(M->getTargetTriple());
	std::string CPUStr, FeaturesStr;
	TargetMachine *Machine = nullptr;
	const TargetOptions Options = InitTargetOptionsFromCodeGenFlags();

	if (ModuleTriple.getArch()) {
	CPUStr = getCPUStr();
	FeaturesStr = getFeaturesStr();
	Machine = GetTargetMachine(ModuleTriple, CPUStr, FeaturesStr, Options);
	}

	std::unique_ptr<TargetMachine> TM(Machine);

	// Override function attributes based on CPUStr, FeaturesStr, and command line
	// flags.
	setFunctionAttributes(CPUStr, FeaturesStr, *M);

	// If the output is set to be emitted to standard out, and standard out is a
	// console, print out a warning message and refuse to do it. We don't
	// impress anyone by spewing tons of binary goo to a terminal.
	if (!Force && !NoOutput && !AnalyzeOnly && !OutputAssembly)
	if (CheckBitcodeOutputToConsole(Out->os(), !Quiet))
	NoOutput = true;

	if (PassPipeline.getNumOccurrences() > 0) {
	OutputKind OK = OK_NoOutput;
	if (!NoOutput)
	OK = OutputAssembly
	? OK_OutputAssembly
	: (OutputThinLTOBC ? OK_OutputThinLTOBitcode : OK_OutputBitcode);

	VerifierKind VK = VK_VerifyInAndOut;
	if (NoVerify)
	VK = VK_NoVerifier;
	else if (VerifyEach)
	VK = VK_VerifyEachPass;

	// The user has asked to use the new pass manager and provided a pipeline
	// string. Hand off the rest of the functionality to the new code for that
	// layer.
	return runPassPipeline(argv[0], *M, TM.get(), Out.get(), ThinLinkOut.get(),
	OptRemarkFile.get(), PassPipeline, OK, VK,
	PreserveAssemblyUseListOrder,
	PreserveBitcodeUseListOrder, EmitSummaryIndex,
	EmitModuleHash)
	? 0
	: 1;
	}

	// Create a PassManager to hold and optimize the collection of passes we are
	// about to build.
	//
	legacy::PassManager Passes;

	// Add an appropriate TargetLibraryInfo pass for the module's triple.
	TargetLibraryInfoImpl TLII(ModuleTriple);

	// The -disable-simplify-libcalls flag actually disables all builtin optzns.
	if (DisableSimplifyLibCalls)
	TLII.disableAllFunctions();
	Passes.add(new TargetLibraryInfoWrapperPass(TLII));

	// Add internal analysis passes from the target machine.
	Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis()
	: TargetIRAnalysis()));

	std::unique_ptr<legacy::FunctionPassManager> FPasses;
	if (OptLevelO0 \|\| OptLevelO1 \|\| OptLevelO2 \|\| OptLevelOs \|\| OptLevelOz \|\|
	OptLevelO3) {
	FPasses.reset(new legacy::FunctionPassManager(M.get()));
	FPasses->add(createTargetTransformInfoWrapperPass(
	TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis()));
	}

	if (PrintBreakpoints) {
	// Default to standard output.
	if (!Out) {
	if (OutputFilename.empty())
	OutputFilename = "-";

	std::error_code EC;
	Out = llvm::make_unique<ToolOutputFile>(OutputFilename, EC,
	sys::fs::F_None);
	if (EC) {
	errs() << EC.message() << '\n';
	return 1;
	}
	}
	Passes.add(createBreakpointPrinter(Out->os()));
	NoOutput = true;
	}

	if (TM) {
	// FIXME: We should dyn_cast this when supported.
	auto &LTM = static_cast<LLVMTargetMachine &>(*TM);
	Pass *TPC = LTM.createPassConfig(Passes);
	Passes.add(TPC);
	}

	// Create a new optimization pass for each one specified on the command line
	for (unsigned i = 0; i < PassList.size(); ++i) {
	if (StandardLinkOpts &&
	StandardLinkOpts.getPosition() < PassList.getPosition(i)) {
	AddStandardLinkPasses(Passes);
	StandardLinkOpts = false;
	}

	if (OptLevelO0 && OptLevelO0.getPosition() < PassList.getPosition(i)) {
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 0, 0);
	OptLevelO0 = false;
	}

	if (OptLevelO1 && OptLevelO1.getPosition() < PassList.getPosition(i)) {
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 1, 0);
	OptLevelO1 = false;
	}

	if (OptLevelO2 && OptLevelO2.getPosition() < PassList.getPosition(i)) {
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 2, 0);
	OptLevelO2 = false;
	}

	if (OptLevelOs && OptLevelOs.getPosition() < PassList.getPosition(i)) {
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 2, 1);
	OptLevelOs = false;
	}

	if (OptLevelOz && OptLevelOz.getPosition() < PassList.getPosition(i)) {
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 2, 2);
	OptLevelOz = false;
	}

	if (OptLevelO3 && OptLevelO3.getPosition() < PassList.getPosition(i)) {
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 3, 0);
	OptLevelO3 = false;
	}

	const PassInfo *PassInf = PassList[i];
	Pass *P = nullptr;
	if (PassInf->getNormalCtor())
	P = PassInf->getNormalCtor()();
	else
	errs() << argv[0] << ": cannot create pass: "
	<< PassInf->getPassName() << "\n";
	if (P) {
	PassKind Kind = P->getPassKind();
	addPass(Passes, P);

	if (AnalyzeOnly) {
	switch (Kind) {
	case PT_BasicBlock:
	Passes.add(createBasicBlockPassPrinter(PassInf, Out->os(), Quiet));
	break;
	case PT_Region:
	Passes.add(createRegionPassPrinter(PassInf, Out->os(), Quiet));
	break;
	case PT_Loop:
	Passes.add(createLoopPassPrinter(PassInf, Out->os(), Quiet));
	break;
	case PT_Function:
	Passes.add(createFunctionPassPrinter(PassInf, Out->os(), Quiet));
	break;
	case PT_CallGraphSCC:
	Passes.add(createCallGraphPassPrinter(PassInf, Out->os(), Quiet));
	break;
	default:
	Passes.add(createModulePassPrinter(PassInf, Out->os(), Quiet));
	break;
	}
	}
	}

	if (PrintEachXForm)
	Passes.add(
	createPrintModulePass(errs(), "", PreserveAssemblyUseListOrder));
	}

	if (StandardLinkOpts) {
	AddStandardLinkPasses(Passes);
	StandardLinkOpts = false;
	}

	if (OptLevelO0)
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 0, 0);

	if (OptLevelO1)
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 1, 0);

	if (OptLevelO2)
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 2, 0);

	if (OptLevelOs)
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 2, 1);

	if (OptLevelOz)
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 2, 2);

	if (OptLevelO3)
	AddOptimizationPasses(Passes, *FPasses, TM.get(), 3, 0);

	if (FPasses) {
	FPasses->doInitialization();
	for (Function &F : *M)
	FPasses->run(F);
	FPasses->doFinalization();
	}

	// Check that the module is well formed on completion of optimization
	if (!NoVerify && !VerifyEach)
	Passes.add(createVerifierPass());

	// In run twice mode, we want to make sure the output is bit-by-bit
	// equivalent if we run the pass manager again, so setup two buffers and
	// a stream to write to them. Note that llc does something similar and it
	// may be worth to abstract this out in the future.
	SmallVector<char, 0> Buffer;
	SmallVector<char, 0> CompileTwiceBuffer;
	std::unique_ptr<raw_svector_ostream> BOS;
	raw_ostream *OS = nullptr;

	// Write bitcode or assembly to the output as the last step...
	if (!NoOutput && !AnalyzeOnly) {
	assert(Out);
	OS = &Out->os();
	if (RunTwice) {
	BOS = make_unique<raw_svector_ostream>(Buffer);
	OS = BOS.get();
	}
	if (OutputAssembly) {
	if (EmitSummaryIndex)
	report_fatal_error("Text output is incompatible with -module-summary");
	if (EmitModuleHash)
	report_fatal_error("Text output is incompatible with -module-hash");
	Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder));
	} else if (OutputThinLTOBC)
	Passes.add(createWriteThinLTOBitcodePass(
	*OS, ThinLinkOut ? &ThinLinkOut->os() : nullptr));
	else
	Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder,
	EmitSummaryIndex, EmitModuleHash));
	}

	// Before executing passes, print the final values of the LLVM options.
	cl::PrintOptionValues();

	// If requested, run all passes again with the same pass manager to catch
	// bugs caused by persistent state in the passes
	if (RunTwice) {
	std::unique_ptr<Module> M2(CloneModule(M.get()));
	Passes.run(*M2);
	CompileTwiceBuffer = Buffer;
	Buffer.clear();
	}

	// Now that we have all of the passes ready, run them.
	Passes.run(*M);

	// Compare the two outputs and make sure they're the same
	if (RunTwice) {
	assert(Out);
	if (Buffer.size() != CompileTwiceBuffer.size() \|\|
	(memcmp(Buffer.data(), CompileTwiceBuffer.data(), Buffer.size()) !=
	0)) {
	errs() << "Running the pass manager twice changed the output.\n"
	"Writing the result of the second run to the specified output.\n"
	"To generate the one-run comparison binary, just run without\n"
	"the compile-twice option\n";
	Out->os() << BOS->str();
	Out->keep();
	if (OptRemarkFile)
	OptRemarkFile->keep();
	return 1;
	}
	Out->os() << BOS->str();
	}

	// Declare success.
	if (!NoOutput \|\| PrintBreakpoints)
	Out->keep();

	if (OptRemarkFile)
	OptRemarkFile->keep();

	if (ThinLinkOut)
	ThinLinkOut->keep();

	return 0;
	}
	Index: head/contrib/llvm
	===================================================================
	--- head/contrib/llvm (revision 328816)
	+++ head/contrib/llvm (revision 328817)

	Property changes on: head/contrib/llvm
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/llvm/dist-release_60:r328749-328794
	Index: head/lib/clang/include/clang/Basic/Version.inc
	===================================================================
	--- head/lib/clang/include/clang/Basic/Version.inc (revision 328816)
	+++ head/lib/clang/include/clang/Basic/Version.inc (revision 328817)
	@@ -1,11 +1,11 @@
	/* $FreeBSD$ */

	#define CLANG_VERSION 6.0.0
	#define CLANG_VERSION_STRING "6.0.0"
	#define CLANG_VERSION_MAJOR 6
	#define CLANG_VERSION_MINOR 0
	#define CLANG_VERSION_PATCHLEVEL 0

	#define CLANG_VENDOR "FreeBSD "

	-#define SVN_REVISION "323948"
	+#define SVN_REVISION "324090"
	Index: head/lib/clang/include/lld/Common/Version.inc
	===================================================================
	--- head/lib/clang/include/lld/Common/Version.inc (revision 328816)
	+++ head/lib/clang/include/lld/Common/Version.inc (revision 328817)
	@@ -1,8 +1,8 @@
	// $FreeBSD$

	#define LLD_VERSION 6.0.0
	#define LLD_VERSION_STRING "6.0.0"
	#define LLD_VERSION_MAJOR 6
	#define LLD_VERSION_MINOR 0
	-#define LLD_REVISION_STRING "323948"
	+#define LLD_REVISION_STRING "324090"
	#define LLD_REPOSITORY_STRING "FreeBSD"
	Index: head/lib/clang/include/llvm/Support/VCSRevision.h
	===================================================================
	--- head/lib/clang/include/llvm/Support/VCSRevision.h (revision 328816)
	+++ head/lib/clang/include/llvm/Support/VCSRevision.h (revision 328817)
	@@ -1,2 +1,2 @@
	/* $FreeBSD$ */
	-#define LLVM_REVISION "svn-r323948"
	+#define LLVM_REVISION "svn-r324090"
	Index: head/lib/clang/libllvm/Makefile
	===================================================================
	--- head/lib/clang/libllvm/Makefile (revision 328816)
	+++ head/lib/clang/libllvm/Makefile (revision 328817)
	@@ -1,1450 +1,1452 @@
	# $FreeBSD$

	.include <src.opts.mk>
	.include "../llvm.pre.mk"

	LIB= llvm
	INTERNALLIB=

	CFLAGS+= -I${.OBJDIR}
	.for arch in AArch64 ARM Mips PowerPC Sparc X86
	CFLAGS+= -I${LLVM_SRCS}/lib/Target/${arch}
	.endfor

	SRCDIR= lib

	SRCS_MIN+= Analysis/AliasAnalysis.cpp
	SRCS_MIN+= Analysis/AliasAnalysisEvaluator.cpp
	SRCS_MIN+= Analysis/AliasAnalysisSummary.cpp
	SRCS_MIN+= Analysis/AliasSetTracker.cpp
	SRCS_EXT+= Analysis/Analysis.cpp
	SRCS_MIN+= Analysis/AssumptionCache.cpp
	SRCS_MIN+= Analysis/BasicAliasAnalysis.cpp
	SRCS_MIN+= Analysis/BlockFrequencyInfo.cpp
	SRCS_MIN+= Analysis/BlockFrequencyInfoImpl.cpp
	SRCS_MIN+= Analysis/BranchProbabilityInfo.cpp
	SRCS_MIN+= Analysis/CFG.cpp
	SRCS_MIN+= Analysis/CFGPrinter.cpp
	SRCS_MIN+= Analysis/CFLAndersAliasAnalysis.cpp
	SRCS_MIN+= Analysis/CFLSteensAliasAnalysis.cpp
	SRCS_MIN+= Analysis/CGSCCPassManager.cpp
	SRCS_MIN+= Analysis/CallGraph.cpp
	SRCS_MIN+= Analysis/CallGraphSCCPass.cpp
	SRCS_MIN+= Analysis/CallPrinter.cpp
	SRCS_MIN+= Analysis/CaptureTracking.cpp
	SRCS_MIN+= Analysis/CmpInstAnalysis.cpp
	SRCS_MIN+= Analysis/CodeMetrics.cpp
	SRCS_MIN+= Analysis/ConstantFolding.cpp
	SRCS_MIN+= Analysis/CostModel.cpp
	SRCS_MIN+= Analysis/Delinearization.cpp
	SRCS_MIN+= Analysis/DemandedBits.cpp
	SRCS_MIN+= Analysis/DependenceAnalysis.cpp
	SRCS_MIN+= Analysis/DivergenceAnalysis.cpp
	SRCS_MIN+= Analysis/DomPrinter.cpp
	SRCS_MIN+= Analysis/DominanceFrontier.cpp
	SRCS_MIN+= Analysis/EHPersonalities.cpp
	SRCS_MIN+= Analysis/GlobalsModRef.cpp
	SRCS_MIN+= Analysis/IVUsers.cpp
	SRCS_MIN+= Analysis/IndirectCallPromotionAnalysis.cpp
	SRCS_MIN+= Analysis/InlineCost.cpp
	SRCS_MIN+= Analysis/InstCount.cpp
	SRCS_MIN+= Analysis/InstructionSimplify.cpp
	SRCS_MIN+= Analysis/Interval.cpp
	SRCS_MIN+= Analysis/IntervalPartition.cpp
	SRCS_MIN+= Analysis/IteratedDominanceFrontier.cpp
	SRCS_MIN+= Analysis/LazyBlockFrequencyInfo.cpp
	SRCS_MIN+= Analysis/LazyBranchProbabilityInfo.cpp
	SRCS_MIN+= Analysis/LazyCallGraph.cpp
	SRCS_MIN+= Analysis/LazyValueInfo.cpp
	SRCS_MIN+= Analysis/Lint.cpp
	SRCS_MIN+= Analysis/Loads.cpp
	SRCS_MIN+= Analysis/LoopAccessAnalysis.cpp
	SRCS_MIN+= Analysis/LoopAnalysisManager.cpp
	SRCS_MIN+= Analysis/LoopInfo.cpp
	SRCS_MIN+= Analysis/LoopPass.cpp
	SRCS_MIN+= Analysis/LoopUnrollAnalyzer.cpp
	SRCS_MIN+= Analysis/MemDepPrinter.cpp
	SRCS_MIN+= Analysis/MemDerefPrinter.cpp
	SRCS_MIN+= Analysis/MemoryBuiltins.cpp
	SRCS_MIN+= Analysis/MemoryDependenceAnalysis.cpp
	SRCS_MIN+= Analysis/MemoryLocation.cpp
	SRCS_MIN+= Analysis/MemorySSA.cpp
	SRCS_MIN+= Analysis/MemorySSAUpdater.cpp
	SRCS_MIN+= Analysis/ModuleDebugInfoPrinter.cpp
	SRCS_MIN+= Analysis/ModuleSummaryAnalysis.cpp
	SRCS_MIN+= Analysis/ObjCARCAliasAnalysis.cpp
	SRCS_MIN+= Analysis/ObjCARCAnalysisUtils.cpp
	SRCS_MIN+= Analysis/ObjCARCInstKind.cpp
	SRCS_MIN+= Analysis/OptimizationRemarkEmitter.cpp
	SRCS_MIN+= Analysis/OrderedBasicBlock.cpp
	SRCS_MIN+= Analysis/PHITransAddr.cpp
	SRCS_MIN+= Analysis/PostDominators.cpp
	SRCS_MIN+= Analysis/ProfileSummaryInfo.cpp
	SRCS_MIN+= Analysis/PtrUseVisitor.cpp
	SRCS_MIN+= Analysis/RegionInfo.cpp
	SRCS_MIN+= Analysis/RegionPass.cpp
	SRCS_MIN+= Analysis/RegionPrinter.cpp
	SRCS_MIN+= Analysis/ScalarEvolution.cpp
	SRCS_MIN+= Analysis/ScalarEvolutionAliasAnalysis.cpp
	SRCS_MIN+= Analysis/ScalarEvolutionExpander.cpp
	SRCS_MIN+= Analysis/ScalarEvolutionNormalization.cpp
	SRCS_MIN+= Analysis/ScopedNoAliasAA.cpp
	SRCS_MIN+= Analysis/TargetLibraryInfo.cpp
	SRCS_MIN+= Analysis/TargetTransformInfo.cpp
	SRCS_MIN+= Analysis/Trace.cpp
	SRCS_MIN+= Analysis/TypeBasedAliasAnalysis.cpp
	SRCS_MIN+= Analysis/TypeMetadataUtils.cpp
	SRCS_MIN+= Analysis/ValueLattice.cpp
	SRCS_MIN+= Analysis/ValueLatticeUtils.cpp
	SRCS_MIN+= Analysis/ValueTracking.cpp
	SRCS_MIN+= Analysis/VectorUtils.cpp
	SRCS_MIN+= AsmParser/LLLexer.cpp
	SRCS_MIN+= AsmParser/LLParser.cpp
	SRCS_MIN+= AsmParser/Parser.cpp
	SRCS_MIN+= BinaryFormat/Dwarf.cpp
	SRCS_MIN+= BinaryFormat/Magic.cpp
	SRCS_MIN+= Bitcode/Reader/BitReader.cpp
	SRCS_MIN+= Bitcode/Reader/BitcodeReader.cpp
	SRCS_MIN+= Bitcode/Reader/BitstreamReader.cpp
	SRCS_MIN+= Bitcode/Reader/MetadataLoader.cpp
	SRCS_MIN+= Bitcode/Reader/ValueList.cpp
	SRCS_MIN+= Bitcode/Writer/BitWriter.cpp
	SRCS_MIN+= Bitcode/Writer/BitcodeWriter.cpp
	SRCS_MIN+= Bitcode/Writer/BitcodeWriterPass.cpp
	SRCS_MIN+= Bitcode/Writer/ValueEnumerator.cpp
	SRCS_MIN+= CodeGen/AggressiveAntiDepBreaker.cpp
	SRCS_MIN+= CodeGen/AllocationOrder.cpp
	SRCS_MIN+= CodeGen/Analysis.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/ARMException.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/AddressPool.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/AsmPrinter.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/CodeViewDebug.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DIE.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DIEHash.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DebugHandlerBase.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DebugLocStream.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DwarfAccelTable.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DwarfCFIException.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DwarfCompileUnit.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DwarfDebug.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DwarfExpression.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DwarfFile.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DwarfStringPool.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/DwarfUnit.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/EHStreamer.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/ErlangGCPrinter.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/OcamlGCPrinter.cpp
	SRCS_MIN+= CodeGen/AsmPrinter/WinException.cpp
	SRCS_MIN+= CodeGen/AtomicExpandPass.cpp
	SRCS_MIN+= CodeGen/BasicTargetTransformInfo.cpp
	SRCS_MIN+= CodeGen/BranchFolding.cpp
	SRCS_MIN+= CodeGen/BranchRelaxation.cpp
	SRCS_MIN+= CodeGen/BuiltinGCs.cpp
	SRCS_MIN+= CodeGen/CalcSpillWeights.cpp
	SRCS_MIN+= CodeGen/CallingConvLower.cpp
	SRCS_MIN+= CodeGen/CodeGen.cpp
	SRCS_MIN+= CodeGen/CodeGenPrepare.cpp
	SRCS_MIN+= CodeGen/CriticalAntiDepBreaker.cpp
	SRCS_MIN+= CodeGen/DFAPacketizer.cpp
	SRCS_MIN+= CodeGen/DeadMachineInstructionElim.cpp
	SRCS_MIN+= CodeGen/DetectDeadLanes.cpp
	SRCS_MIN+= CodeGen/DwarfEHPrepare.cpp
	SRCS_MIN+= CodeGen/EarlyIfConversion.cpp
	SRCS_MIN+= CodeGen/EdgeBundles.cpp
	SRCS_MIN+= CodeGen/ExecutionDepsFix.cpp
	SRCS_MIN+= CodeGen/ExpandISelPseudos.cpp
	SRCS_MIN+= CodeGen/ExpandMemCmp.cpp
	SRCS_MIN+= CodeGen/ExpandPostRAPseudos.cpp
	SRCS_MIN+= CodeGen/ExpandReductions.cpp
	SRCS_MIN+= CodeGen/FEntryInserter.cpp
	SRCS_MIN+= CodeGen/FaultMaps.cpp
	SRCS_MIN+= CodeGen/FuncletLayout.cpp
	SRCS_MIN+= CodeGen/GCMetadata.cpp
	SRCS_MIN+= CodeGen/GCMetadataPrinter.cpp
	SRCS_MIN+= CodeGen/GCRootLowering.cpp
	SRCS_MIN+= CodeGen/GCStrategy.cpp
	SRCS_MIN+= CodeGen/GlobalISel/CallLowering.cpp
	SRCS_MIN+= CodeGen/GlobalISel/GlobalISel.cpp
	SRCS_MIN+= CodeGen/GlobalISel/IRTranslator.cpp
	SRCS_MIN+= CodeGen/GlobalISel/InstructionSelect.cpp
	SRCS_MIN+= CodeGen/GlobalISel/InstructionSelector.cpp
	SRCS_MIN+= CodeGen/GlobalISel/Legalizer.cpp
	SRCS_MIN+= CodeGen/GlobalISel/LegalizerHelper.cpp
	SRCS_MIN+= CodeGen/GlobalISel/LegalizerInfo.cpp
	SRCS_MIN+= CodeGen/GlobalISel/Localizer.cpp
	SRCS_MIN+= CodeGen/GlobalISel/MachineIRBuilder.cpp
	SRCS_MIN+= CodeGen/GlobalISel/RegBankSelect.cpp
	SRCS_MIN+= CodeGen/GlobalISel/RegisterBank.cpp
	SRCS_MIN+= CodeGen/GlobalISel/RegisterBankInfo.cpp
	SRCS_MIN+= CodeGen/GlobalISel/Utils.cpp
	SRCS_MIN+= CodeGen/GlobalMerge.cpp
	SRCS_MIN+= CodeGen/IfConversion.cpp
	SRCS_MIN+= CodeGen/ImplicitNullChecks.cpp
	+SRCS_MIN+= CodeGen/IndirectBrExpandPass.cpp
	SRCS_MIN+= CodeGen/InlineSpiller.cpp
	SRCS_MIN+= CodeGen/InterferenceCache.cpp
	SRCS_MIN+= CodeGen/InterleavedAccessPass.cpp
	SRCS_MIN+= CodeGen/IntrinsicLowering.cpp
	SRCS_MIN+= CodeGen/LLVMTargetMachine.cpp
	SRCS_MIN+= CodeGen/LatencyPriorityQueue.cpp
	SRCS_MIN+= CodeGen/LazyMachineBlockFrequencyInfo.cpp
	SRCS_MIN+= CodeGen/LexicalScopes.cpp
	SRCS_MIN+= CodeGen/LiveDebugValues.cpp
	SRCS_MIN+= CodeGen/LiveDebugVariables.cpp
	SRCS_MIN+= CodeGen/LiveInterval.cpp
	SRCS_MIN+= CodeGen/LiveIntervalUnion.cpp
	SRCS_MIN+= CodeGen/LiveIntervals.cpp
	SRCS_MIN+= CodeGen/LivePhysRegs.cpp
	SRCS_MIN+= CodeGen/LiveRangeCalc.cpp
	SRCS_MIN+= CodeGen/LiveRangeEdit.cpp
	SRCS_MIN+= CodeGen/LiveRangeShrink.cpp
	SRCS_MIN+= CodeGen/LiveRegMatrix.cpp
	SRCS_MIN+= CodeGen/LiveRegUnits.cpp
	SRCS_MIN+= CodeGen/LiveStacks.cpp
	SRCS_MIN+= CodeGen/LiveVariables.cpp
	SRCS_MIN+= CodeGen/LocalStackSlotAllocation.cpp
	SRCS_MIN+= CodeGen/LowLevelType.cpp
	SRCS_MIN+= CodeGen/LowerEmuTLS.cpp
	SRCS_MIN+= CodeGen/MIRCanonicalizerPass.cpp
	SRCS_EXT+= CodeGen/MIRParser/MILexer.cpp
	SRCS_EXT+= CodeGen/MIRParser/MIParser.cpp
	SRCS_EXT+= CodeGen/MIRParser/MIRParser.cpp
	SRCS_MIN+= CodeGen/MIRPrinter.cpp
	SRCS_MIN+= CodeGen/MIRPrintingPass.cpp
	SRCS_MIN+= CodeGen/MachineBasicBlock.cpp
	SRCS_MIN+= CodeGen/MachineBlockFrequencyInfo.cpp
	SRCS_MIN+= CodeGen/MachineBlockPlacement.cpp
	SRCS_MIN+= CodeGen/MachineBranchProbabilityInfo.cpp
	SRCS_MIN+= CodeGen/MachineCSE.cpp
	SRCS_MIN+= CodeGen/MachineCombiner.cpp
	SRCS_MIN+= CodeGen/MachineCopyPropagation.cpp
	SRCS_MIN+= CodeGen/MachineDominanceFrontier.cpp
	SRCS_MIN+= CodeGen/MachineDominators.cpp
	SRCS_MIN+= CodeGen/MachineFrameInfo.cpp
	SRCS_MIN+= CodeGen/MachineFunction.cpp
	SRCS_MIN+= CodeGen/MachineFunctionPass.cpp
	SRCS_MIN+= CodeGen/MachineFunctionPrinterPass.cpp
	SRCS_MIN+= CodeGen/MachineInstr.cpp
	SRCS_MIN+= CodeGen/MachineInstrBundle.cpp
	SRCS_MIN+= CodeGen/MachineLICM.cpp
	SRCS_MIN+= CodeGen/MachineLoopInfo.cpp
	SRCS_MIN+= CodeGen/MachineModuleInfo.cpp
	SRCS_MIN+= CodeGen/MachineModuleInfoImpls.cpp
	SRCS_MIN+= CodeGen/MachineOperand.cpp
	SRCS_MIN+= CodeGen/MachineOptimizationRemarkEmitter.cpp
	SRCS_MIN+= CodeGen/MachineOutliner.cpp
	SRCS_MIN+= CodeGen/MachinePassRegistry.cpp
	SRCS_MIN+= CodeGen/MachinePipeliner.cpp
	SRCS_MIN+= CodeGen/MachinePostDominators.cpp
	SRCS_MIN+= CodeGen/MachineRegionInfo.cpp
	SRCS_MIN+= CodeGen/MachineRegisterInfo.cpp
	SRCS_MIN+= CodeGen/MachineSSAUpdater.cpp
	SRCS_MIN+= CodeGen/MachineScheduler.cpp
	SRCS_MIN+= CodeGen/MachineSink.cpp
	SRCS_MIN+= CodeGen/MachineTraceMetrics.cpp
	SRCS_MIN+= CodeGen/MachineVerifier.cpp
	SRCS_MIN+= CodeGen/MacroFusion.cpp
	SRCS_MIN+= CodeGen/OptimizePHIs.cpp
	SRCS_MIN+= CodeGen/PHIElimination.cpp
	SRCS_MIN+= CodeGen/PHIEliminationUtils.cpp
	SRCS_MIN+= CodeGen/ParallelCG.cpp
	SRCS_MIN+= CodeGen/PatchableFunction.cpp
	SRCS_MIN+= CodeGen/PeepholeOptimizer.cpp
	SRCS_MIN+= CodeGen/PostRAHazardRecognizer.cpp
	SRCS_MIN+= CodeGen/PostRASchedulerList.cpp
	SRCS_MIN+= CodeGen/PreISelIntrinsicLowering.cpp
	SRCS_MIN+= CodeGen/ProcessImplicitDefs.cpp
	SRCS_MIN+= CodeGen/PrologEpilogInserter.cpp
	SRCS_MIN+= CodeGen/PseudoSourceValue.cpp
	SRCS_MIN+= CodeGen/RegAllocBase.cpp
	SRCS_MIN+= CodeGen/RegAllocBasic.cpp
	SRCS_MIN+= CodeGen/RegAllocFast.cpp
	SRCS_MIN+= CodeGen/RegAllocGreedy.cpp
	SRCS_MIN+= CodeGen/RegAllocPBQP.cpp
	SRCS_MIN+= CodeGen/RegUsageInfoCollector.cpp
	SRCS_MIN+= CodeGen/RegUsageInfoPropagate.cpp
	SRCS_MIN+= CodeGen/RegisterClassInfo.cpp
	SRCS_MIN+= CodeGen/RegisterCoalescer.cpp
	SRCS_MIN+= CodeGen/RegisterPressure.cpp
	SRCS_MIN+= CodeGen/RegisterScavenging.cpp
	SRCS_MIN+= CodeGen/RegisterUsageInfo.cpp
	SRCS_MIN+= CodeGen/RenameIndependentSubregs.cpp
	SRCS_MIN+= CodeGen/ResetMachineFunctionPass.cpp
	SRCS_MIN+= CodeGen/SafeStack.cpp
	SRCS_MIN+= CodeGen/SafeStackColoring.cpp
	SRCS_MIN+= CodeGen/SafeStackLayout.cpp
	SRCS_MIN+= CodeGen/ScalarizeMaskedMemIntrin.cpp
	SRCS_MIN+= CodeGen/ScheduleDAG.cpp
	SRCS_MIN+= CodeGen/ScheduleDAGInstrs.cpp
	SRCS_MIN+= CodeGen/ScheduleDAGPrinter.cpp
	SRCS_MIN+= CodeGen/ScoreboardHazardRecognizer.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/DAGCombiner.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/FastISel.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/InstrEmitter.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/LegalizeDAG.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/LegalizeTypes.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/LegalizeVectorOps.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/ScheduleDAGFast.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAG.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGDumper.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGISel.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/StatepointLowering.cpp
	SRCS_MIN+= CodeGen/SelectionDAG/TargetLowering.cpp
	SRCS_MIN+= CodeGen/ShadowStackGCLowering.cpp
	SRCS_MIN+= CodeGen/ShrinkWrap.cpp
	SRCS_MIN+= CodeGen/SjLjEHPrepare.cpp
	SRCS_MIN+= CodeGen/SlotIndexes.cpp
	SRCS_MIN+= CodeGen/SpillPlacement.cpp
	SRCS_MIN+= CodeGen/SplitKit.cpp
	SRCS_MIN+= CodeGen/StackColoring.cpp
	SRCS_MIN+= CodeGen/StackMapLivenessAnalysis.cpp
	SRCS_MIN+= CodeGen/StackMaps.cpp
	SRCS_MIN+= CodeGen/StackProtector.cpp
	SRCS_MIN+= CodeGen/StackSlotColoring.cpp
	SRCS_MIN+= CodeGen/TailDuplication.cpp
	SRCS_MIN+= CodeGen/TailDuplicator.cpp
	SRCS_MIN+= CodeGen/TargetFrameLoweringImpl.cpp
	SRCS_MIN+= CodeGen/TargetInstrInfo.cpp
	SRCS_MIN+= CodeGen/TargetLoweringBase.cpp
	SRCS_MIN+= CodeGen/TargetLoweringObjectFileImpl.cpp
	SRCS_MIN+= CodeGen/TargetOptionsImpl.cpp
	SRCS_MIN+= CodeGen/TargetPassConfig.cpp
	SRCS_MIN+= CodeGen/TargetRegisterInfo.cpp
	SRCS_MIN+= CodeGen/TargetSchedule.cpp
	SRCS_MIN+= CodeGen/TargetSubtargetInfo.cpp
	SRCS_MIN+= CodeGen/TwoAddressInstructionPass.cpp
	SRCS_MIN+= CodeGen/UnreachableBlockElim.cpp
	SRCS_MIN+= CodeGen/VirtRegMap.cpp
	SRCS_MIN+= CodeGen/WinEHPrepare.cpp
	SRCS_MIN+= CodeGen/XRayInstrumentation.cpp
	SRCS_EXT+= DebugInfo/CodeView/AppendingTypeTableBuilder.cpp
	SRCS_MIN+= DebugInfo/CodeView/CVSymbolVisitor.cpp
	SRCS_MIN+= DebugInfo/CodeView/CVTypeVisitor.cpp
	SRCS_MIN+= DebugInfo/CodeView/CodeViewError.cpp
	SRCS_MIN+= DebugInfo/CodeView/CodeViewRecordIO.cpp
	SRCS_MIN+= DebugInfo/CodeView/ContinuationRecordBuilder.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugChecksumsSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugCrossExSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugCrossImpSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugFrameDataSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugLinesSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugStringTableSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugSubsectionRecord.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugSubsectionVisitor.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugSymbolRVASubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/DebugSymbolsSubsection.cpp
	SRCS_EXT+= DebugInfo/CodeView/EnumTables.cpp
	SRCS_MIN+= DebugInfo/CodeView/Formatters.cpp
	SRCS_MIN+= DebugInfo/CodeView/GlobalTypeTableBuilder.cpp
	SRCS_EXT+= DebugInfo/CodeView/LazyRandomTypeCollection.cpp
	SRCS_MIN+= DebugInfo/CodeView/Line.cpp
	SRCS_EXT+= DebugInfo/CodeView/MergingTypeTableBuilder.cpp
	SRCS_MIN+= DebugInfo/CodeView/RecordName.cpp
	SRCS_MIN+= DebugInfo/CodeView/RecordSerialization.cpp
	SRCS_MIN+= DebugInfo/CodeView/SimpleTypeSerializer.cpp
	SRCS_EXT+= DebugInfo/CodeView/StringsAndChecksums.cpp
	SRCS_MIN+= DebugInfo/CodeView/SymbolDumper.cpp
	SRCS_MIN+= DebugInfo/CodeView/SymbolRecordMapping.cpp
	SRCS_EXT+= DebugInfo/CodeView/SymbolSerializer.cpp
	SRCS_MIN+= DebugInfo/CodeView/TypeDumpVisitor.cpp
	SRCS_MIN+= DebugInfo/CodeView/TypeHashing.cpp
	SRCS_MIN+= DebugInfo/CodeView/TypeIndex.cpp
	SRCS_MIN+= DebugInfo/CodeView/TypeIndexDiscovery.cpp
	SRCS_MIN+= DebugInfo/CodeView/TypeRecordMapping.cpp
	SRCS_MIN+= DebugInfo/CodeView/TypeStreamMerger.cpp
	SRCS_MIN+= DebugInfo/CodeView/TypeTableCollection.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFAcceleratorTable.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFCompileUnit.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFContext.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDataExtractor.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugAbbrev.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugArangeSet.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugAranges.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugFrame.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugLine.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugLoc.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugMacro.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugPubTable.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDebugRangeList.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFDie.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFExpression.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFFormValue.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFGdbIndex.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFTypeUnit.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFUnit.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFUnitIndex.cpp
	SRCS_MIW+= DebugInfo/DWARF/DWARFVerifier.cpp
	SRCS_MIW+= DebugInfo/DWARF/SyntaxHighlighting.cpp
	SRCS_MIN+= DebugInfo/MSF/MSFBuilder.cpp
	SRCS_MIN+= DebugInfo/MSF/MSFCommon.cpp
	SRCS_MIN+= DebugInfo/MSF/MSFError.cpp
	SRCS_MIN+= DebugInfo/MSF/MappedBlockStream.cpp
	SRCS_EXT+= DebugInfo/PDB/GenericError.cpp
	SRCS_EXT+= DebugInfo/PDB/IPDBSourceFile.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/DbiModuleDescriptor.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/DbiModuleList.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/DbiStream.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/DbiStreamBuilder.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/EnumTables.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/GSIStreamBuilder.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/GlobalsStream.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/Hash.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/HashTable.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/InfoStream.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/InfoStreamBuilder.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/ModuleDebugStream.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NamedStreamMap.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NativeBuiltinSymbol.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NativeEnumModules.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NativeEnumSymbol.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NativeEnumTypes.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NativeExeSymbol.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NativeRawSymbol.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/NativeSession.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/PDBFile.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/PDBFileBuilder.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/PDBStringTable.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/PublicsStream.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/RawError.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/SymbolStream.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/TpiHashing.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/TpiStream.cpp
	SRCS_EXT+= DebugInfo/PDB/Native/TpiStreamBuilder.cpp
	SRCS_EXT+= DebugInfo/PDB/PDB.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBContext.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBExtras.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBInterfaceAnchors.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymDumper.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbol.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolAnnotation.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolBlock.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolCompiland.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolCustom.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolData.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolExe.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolFunc.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolLabel.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolThunk.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeArray.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeCustom.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeDimension.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeEnum.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeFriend.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeManaged.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypePointer.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeUDT.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeVTable.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolUnknown.cpp
	SRCS_EXT+= DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
	SRCS_EXT+= DebugInfo/PDB/UDTLayout.cpp
	SRCS_EXT+= DebugInfo/Symbolize/DIPrinter.cpp
	SRCS_MIW+= DebugInfo/Symbolize/SymbolizableObjectFile.cpp
	SRCS_MIW+= DebugInfo/Symbolize/Symbolize.cpp
	SRCS_MIN+= Demangle/ItaniumDemangle.cpp
	SRCS_XDB+= ExecutionEngine/ExecutionEngine.cpp
	SRCS_XDB+= ExecutionEngine/ExecutionEngineBindings.cpp
	SRCS_XDB+= ExecutionEngine/GDBRegistrationListener.cpp
	SRCS_XDB+= ExecutionEngine/Interpreter/Execution.cpp
	SRCS_XDB+= ExecutionEngine/Interpreter/ExternalFunctions.cpp
	SRCS_XDB+= ExecutionEngine/Interpreter/Interpreter.cpp
	SRCS_XDB+= ExecutionEngine/MCJIT/MCJIT.cpp
	SRCS_EXT+= ExecutionEngine/Orc/ExecutionUtils.cpp
	SRCS_EXT+= ExecutionEngine/Orc/IndirectionUtils.cpp
	SRCS_EXT+= ExecutionEngine/Orc/NullResolver.cpp
	SRCS_EXT+= ExecutionEngine/Orc/OrcABISupport.cpp
	SRCS_EXT+= ExecutionEngine/Orc/OrcCBindings.cpp
	SRCS_EXT+= ExecutionEngine/Orc/OrcError.cpp
	SRCS_EXT+= ExecutionEngine/Orc/OrcMCJITReplacement.cpp
	SRCS_EXT+= ExecutionEngine/Orc/RPCUtils.cpp
	SRCS_XDB+= ExecutionEngine/RuntimeDyld/JITSymbol.cpp
	SRCS_XDB+= ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
	SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
	SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
	SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
	SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
	SRCS_XDB+= ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
	SRCS_XDB+= ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp
	SRCS_XDB+= ExecutionEngine/SectionMemoryManager.cpp
	SRCS_XDB+= ExecutionEngine/TargetSelect.cpp
	SRCS_MIN+= IR/AsmWriter.cpp
	SRCS_MIN+= IR/Attributes.cpp
	SRCS_MIN+= IR/AutoUpgrade.cpp
	SRCS_MIN+= IR/BasicBlock.cpp
	SRCS_MIN+= IR/Comdat.cpp
	SRCS_MIN+= IR/ConstantFold.cpp
	SRCS_MIN+= IR/ConstantRange.cpp
	SRCS_MIN+= IR/Constants.cpp
	SRCS_MIN+= IR/Core.cpp
	SRCS_MIN+= IR/DIBuilder.cpp
	SRCS_MIN+= IR/DataLayout.cpp
	SRCS_MIN+= IR/DebugInfo.cpp
	SRCS_MIN+= IR/DebugInfoMetadata.cpp
	SRCS_MIN+= IR/DebugLoc.cpp
	SRCS_MIN+= IR/DiagnosticHandler.cpp
	SRCS_MIN+= IR/DiagnosticInfo.cpp
	SRCS_MIN+= IR/DiagnosticPrinter.cpp
	SRCS_MIN+= IR/Dominators.cpp
	SRCS_MIN+= IR/Function.cpp
	SRCS_MIN+= IR/GVMaterializer.cpp
	SRCS_MIN+= IR/Globals.cpp
	SRCS_MIN+= IR/IRBuilder.cpp
	SRCS_MIN+= IR/IRPrintingPasses.cpp
	SRCS_MIN+= IR/InlineAsm.cpp
	SRCS_MIN+= IR/Instruction.cpp
	SRCS_MIN+= IR/Instructions.cpp
	SRCS_MIN+= IR/IntrinsicInst.cpp
	SRCS_MIN+= IR/LLVMContext.cpp
	SRCS_MIN+= IR/LLVMContextImpl.cpp
	SRCS_MIN+= IR/LegacyPassManager.cpp
	SRCS_MIN+= IR/MDBuilder.cpp
	SRCS_MIN+= IR/Mangler.cpp
	SRCS_MIN+= IR/Metadata.cpp
	SRCS_MIN+= IR/Module.cpp
	SRCS_MIN+= IR/ModuleSummaryIndex.cpp
	SRCS_MIN+= IR/Operator.cpp
	SRCS_MIN+= IR/OptBisect.cpp
	SRCS_MIN+= IR/Pass.cpp
	SRCS_MIN+= IR/PassManager.cpp
	SRCS_MIN+= IR/PassRegistry.cpp
	SRCS_MIN+= IR/ProfileSummary.cpp
	SRCS_MIN+= IR/SafepointIRVerifier.cpp
	SRCS_MIN+= IR/Statepoint.cpp
	SRCS_MIN+= IR/Type.cpp
	SRCS_MIN+= IR/TypeFinder.cpp
	SRCS_MIN+= IR/Use.cpp
	SRCS_MIN+= IR/User.cpp
	SRCS_MIN+= IR/Value.cpp
	SRCS_MIN+= IR/ValueSymbolTable.cpp
	SRCS_MIN+= IR/ValueTypes.cpp
	SRCS_MIN+= IR/Verifier.cpp
	SRCS_MIN+= IRReader/IRReader.cpp
	SRCS_EXL+= LTO/Caching.cpp
	SRCS_MIN+= LTO/LTO.cpp
	SRCS_MIN+= LTO/LTOBackend.cpp
	SRCS_EXL+= LTO/LTOCodeGenerator.cpp
	SRCS_EXL+= LTO/LTOModule.cpp
	SRCS_EXL+= LTO/ThinLTOCodeGenerator.cpp
	SRCS_MIN+= LTO/UpdateCompilerUsed.cpp
	SRCS_MIN+= LineEditor/LineEditor.cpp
	SRCS_MIN+= Linker/IRMover.cpp
	SRCS_MIN+= Linker/LinkModules.cpp
	SRCS_MIN+= MC/ConstantPools.cpp
	SRCS_MIN+= MC/ELFObjectWriter.cpp
	SRCS_MIN+= MC/MCAsmBackend.cpp
	SRCS_MIN+= MC/MCAsmInfo.cpp
	SRCS_MIN+= MC/MCAsmInfoCOFF.cpp
	SRCS_MIN+= MC/MCAsmInfoDarwin.cpp
	SRCS_MIN+= MC/MCAsmInfoELF.cpp
	SRCS_MIN+= MC/MCAsmStreamer.cpp
	SRCS_MIN+= MC/MCAssembler.cpp
	SRCS_MIN+= MC/MCCodeEmitter.cpp
	SRCS_MIN+= MC/MCCodePadder.cpp
	SRCS_MIN+= MC/MCCodeView.cpp
	SRCS_MIN+= MC/MCContext.cpp
	SRCS_XDL+= MC/MCDisassembler/Disassembler.cpp
	SRCS_XDW+= MC/MCDisassembler/MCDisassembler.cpp
	SRCS_XDW+= MC/MCDisassembler/MCExternalSymbolizer.cpp
	SRCS_MIN+= MC/MCDisassembler/MCRelocationInfo.cpp
	SRCS_XDW+= MC/MCDisassembler/MCSymbolizer.cpp
	SRCS_MIN+= MC/MCDwarf.cpp
	SRCS_MIN+= MC/MCELFObjectTargetWriter.cpp
	SRCS_MIN+= MC/MCELFStreamer.cpp
	SRCS_MIN+= MC/MCExpr.cpp
	SRCS_MIN+= MC/MCFragment.cpp
	SRCS_MIN+= MC/MCInst.cpp
	SRCS_MIN+= MC/MCInstPrinter.cpp
	SRCS_MIN+= MC/MCInstrAnalysis.cpp
	SRCS_MIN+= MC/MCInstrDesc.cpp
	SRCS_MIN+= MC/MCLinkerOptimizationHint.cpp
	SRCS_MIN+= MC/MCMachOStreamer.cpp
	SRCS_MIN+= MC/MCMachObjectTargetWriter.cpp
	SRCS_MIN+= MC/MCNullStreamer.cpp
	SRCS_MIN+= MC/MCObjectFileInfo.cpp
	SRCS_MIN+= MC/MCObjectStreamer.cpp
	SRCS_MIN+= MC/MCObjectWriter.cpp
	SRCS_MIN+= MC/MCParser/AsmLexer.cpp
	SRCS_MIN+= MC/MCParser/AsmParser.cpp
	SRCS_MIN+= MC/MCParser/COFFAsmParser.cpp
	SRCS_MIN+= MC/MCParser/DarwinAsmParser.cpp
	SRCS_MIN+= MC/MCParser/ELFAsmParser.cpp
	SRCS_MIN+= MC/MCParser/MCAsmLexer.cpp
	SRCS_MIN+= MC/MCParser/MCAsmParser.cpp
	SRCS_MIN+= MC/MCParser/MCAsmParserExtension.cpp
	SRCS_MIN+= MC/MCParser/MCTargetAsmParser.cpp
	SRCS_MIN+= MC/MCRegisterInfo.cpp
	SRCS_MIN+= MC/MCSchedule.cpp
	SRCS_MIN+= MC/MCSection.cpp
	SRCS_MIN+= MC/MCSectionCOFF.cpp
	SRCS_MIN+= MC/MCSectionELF.cpp
	SRCS_MIN+= MC/MCSectionMachO.cpp
	SRCS_MIN+= MC/MCSectionWasm.cpp
	SRCS_MIN+= MC/MCStreamer.cpp
	SRCS_MIN+= MC/MCSubtargetInfo.cpp
	SRCS_MIN+= MC/MCSymbol.cpp
	SRCS_MIN+= MC/MCSymbolELF.cpp
	SRCS_MIN+= MC/MCTargetOptions.cpp
	SRCS_MIN+= MC/MCValue.cpp
	SRCS_MIN+= MC/MCWasmStreamer.cpp
	SRCS_MIN+= MC/MCWin64EH.cpp
	SRCS_MIN+= MC/MCWinCOFFStreamer.cpp
	SRCS_MIN+= MC/MCWinEH.cpp
	SRCS_MIN+= MC/MachObjectWriter.cpp
	SRCS_MIN+= MC/StringTableBuilder.cpp
	SRCS_MIN+= MC/SubtargetFeature.cpp
	SRCS_MIN+= MC/WinCOFFObjectWriter.cpp
	SRCS_MIN+= Object/Archive.cpp
	SRCS_MIN+= Object/ArchiveWriter.cpp
	SRCS_MIN+= Object/Binary.cpp
	SRCS_EXT+= Object/COFFImportFile.cpp
	SRCS_EXT+= Object/COFFModuleDefinition.cpp
	SRCS_MIN+= Object/COFFObjectFile.cpp
	SRCS_MIN+= Object/Decompressor.cpp
	SRCS_MIN+= Object/ELF.cpp
	SRCS_MIN+= Object/ELFObjectFile.cpp
	SRCS_MIN+= Object/Error.cpp
	SRCS_MIN+= Object/IRObjectFile.cpp
	SRCS_MIN+= Object/IRSymtab.cpp
	SRCS_MIN+= Object/MachOObjectFile.cpp
	SRCS_MIN+= Object/MachOUniversal.cpp
	SRCS_MIN+= Object/ModuleSymbolTable.cpp
	SRCS_EXT+= Object/Object.cpp
	SRCS_MIN+= Object/ObjectFile.cpp
	SRCS_MIN+= Object/RecordStreamer.cpp
	SRCS_MIW+= Object/SymbolSize.cpp
	SRCS_MIN+= Object/SymbolicFile.cpp
	SRCS_MIN+= Object/WasmObjectFile.cpp
	SRCS_MIN+= Object/WindowsResource.cpp
	SRCS_MIN+= ObjectYAML/COFFYAML.cpp
	SRCS_EXT+= ObjectYAML/CodeViewYAMLDebugSections.cpp
	SRCS_EXT+= ObjectYAML/CodeViewYAMLSymbols.cpp
	SRCS_EXT+= ObjectYAML/CodeViewYAMLTypes.cpp
	SRCS_MIN+= ObjectYAML/DWARFYAML.cpp
	SRCS_MIN+= ObjectYAML/ELFYAML.cpp
	SRCS_MIN+= ObjectYAML/MachOYAML.cpp
	SRCS_EXT+= ObjectYAML/YAML.cpp
	SRCS_MIN+= Option/Arg.cpp
	SRCS_MIN+= Option/ArgList.cpp
	SRCS_MIN+= Option/OptTable.cpp
	SRCS_MIN+= Option/Option.cpp
	SRCS_MIN+= Passes/PassBuilder.cpp
	SRCS_MIN+= ProfileData/Coverage/CoverageMapping.cpp
	SRCS_MIN+= ProfileData/Coverage/CoverageMappingReader.cpp
	SRCS_MIN+= ProfileData/Coverage/CoverageMappingWriter.cpp
	SRCS_EXT+= ProfileData/GCOV.cpp
	SRCS_MIN+= ProfileData/InstrProf.cpp
	SRCS_MIN+= ProfileData/InstrProfReader.cpp
	SRCS_MIN+= ProfileData/InstrProfWriter.cpp
	SRCS_MIN+= ProfileData/ProfileSummaryBuilder.cpp
	SRCS_MIN+= ProfileData/SampleProf.cpp
	SRCS_MIN+= ProfileData/SampleProfReader.cpp
	SRCS_EXT+= ProfileData/SampleProfWriter.cpp
	SRCS_MIN+= Support/APFloat.cpp
	SRCS_MIN+= Support/APInt.cpp
	SRCS_MIN+= Support/APSInt.cpp
	SRCS_MIN+= Support/ARMAttributeParser.cpp
	SRCS_MIN+= Support/ARMBuildAttrs.cpp
	SRCS_MIN+= Support/Allocator.cpp
	SRCS_MIN+= Support/Atomic.cpp
	SRCS_MIN+= Support/BinaryStreamError.cpp
	SRCS_MIN+= Support/BinaryStreamReader.cpp
	SRCS_MIN+= Support/BinaryStreamRef.cpp
	SRCS_MIN+= Support/BinaryStreamWriter.cpp
	SRCS_MIN+= Support/BlockFrequency.cpp
	SRCS_MIN+= Support/BranchProbability.cpp
	SRCS_EXT+= Support/COM.cpp
	SRCS_MIN+= Support/CachePruning.cpp
	SRCS_MIN+= Support/Chrono.cpp
	SRCS_MIN+= Support/CodeGenCoverage.cpp
	SRCS_MIN+= Support/CommandLine.cpp
	SRCS_MIN+= Support/Compression.cpp
	SRCS_MIN+= Support/ConvertUTF.cpp
	SRCS_MIN+= Support/ConvertUTFWrapper.cpp
	SRCS_MIN+= Support/CrashRecoveryContext.cpp
	SRCS_MIN+= Support/DAGDeltaAlgorithm.cpp
	SRCS_MIN+= Support/DataExtractor.cpp
	SRCS_MIN+= Support/Debug.cpp
	SRCS_MIN+= Support/DebugCounter.cpp
	SRCS_MIN+= Support/DeltaAlgorithm.cpp
	SRCS_MIN+= Support/DynamicLibrary.cpp
	SRCS_MIN+= Support/Errno.cpp
	SRCS_MIN+= Support/Error.cpp
	SRCS_MIN+= Support/ErrorHandling.cpp
	SRCS_EXL+= Support/FileOutputBuffer.cpp
	SRCS_EXT+= Support/FileUtilities.cpp
	SRCS_MIN+= Support/FoldingSet.cpp
	SRCS_MIN+= Support/FormatVariadic.cpp
	SRCS_MIN+= Support/FormattedStream.cpp
	SRCS_MIN+= Support/GlobPattern.cpp
	SRCS_MIN+= Support/GraphWriter.cpp
	SRCS_MIN+= Support/Hashing.cpp
	SRCS_MIN+= Support/Host.cpp
	SRCS_MIN+= Support/IntEqClasses.cpp
	SRCS_MIN+= Support/IntervalMap.cpp
	SRCS_MIN+= Support/JamCRC.cpp
	SRCS_MIN+= Support/KnownBits.cpp
	SRCS_MIN+= Support/LEB128.cpp
	SRCS_MIN+= Support/LineIterator.cpp
	SRCS_MIN+= Support/Locale.cpp
	SRCS_MIN+= Support/LockFileManager.cpp
	SRCS_MIN+= Support/LowLevelType.cpp
	SRCS_MIN+= Support/MD5.cpp
	SRCS_MIN+= Support/ManagedStatic.cpp
	SRCS_MIN+= Support/MathExtras.cpp
	SRCS_XDL+= Support/Memory.cpp
	SRCS_MIN+= Support/MemoryBuffer.cpp
	SRCS_MIN+= Support/Mutex.cpp
	SRCS_MIN+= Support/NativeFormatting.cpp
	SRCS_MIN+= Support/Options.cpp
	SRCS_LLD+= Support/Parallel.cpp
	SRCS_MIN+= Support/Path.cpp
	SRCS_MIN+= Support/PluginLoader.cpp
	SRCS_MIN+= Support/PrettyStackTrace.cpp
	SRCS_MIN+= Support/Process.cpp
	SRCS_MIN+= Support/Program.cpp
	SRCS_MIN+= Support/RWMutex.cpp
	SRCS_MIN+= Support/RandomNumberGenerator.cpp
	SRCS_MIN+= Support/Regex.cpp
	SRCS_MIN+= Support/SHA1.cpp
	SRCS_MIN+= Support/ScaledNumber.cpp
	SRCS_MIN+= Support/ScopedPrinter.cpp
	SRCS_MIN+= Support/Signals.cpp
	SRCS_MIN+= Support/SmallPtrSet.cpp
	SRCS_MIN+= Support/SmallVector.cpp
	SRCS_MIN+= Support/SourceMgr.cpp
	SRCS_MIN+= Support/SpecialCaseList.cpp
	SRCS_MIN+= Support/Statistic.cpp
	SRCS_MIN+= Support/StringExtras.cpp
	SRCS_MIN+= Support/StringMap.cpp
	SRCS_MIN+= Support/StringRef.cpp
	SRCS_MIN+= Support/StringSaver.cpp
	SRCS_EXT+= Support/SystemUtils.cpp
	SRCS_LLD+= Support/TarWriter.cpp
	SRCS_MIN+= Support/TargetParser.cpp
	SRCS_MIN+= Support/TargetRegistry.cpp
	SRCS_MIN+= Support/ThreadLocal.cpp
	SRCS_MIN+= Support/ThreadPool.cpp
	SRCS_MIN+= Support/Threading.cpp
	SRCS_MIN+= Support/Timer.cpp
	SRCS_MIN+= Support/ToolOutputFile.cpp
	SRCS_MIN+= Support/TrigramIndex.cpp
	SRCS_MIN+= Support/Triple.cpp
	SRCS_MIN+= Support/Twine.cpp
	SRCS_MIN+= Support/Unicode.cpp
	SRCS_MIN+= Support/Valgrind.cpp
	SRCS_MIN+= Support/YAMLParser.cpp
	SRCS_MIN+= Support/YAMLTraits.cpp
	SRCS_MIN+= Support/circular_raw_ostream.cpp
	SRCS_MIN+= Support/raw_os_ostream.cpp
	SRCS_MIN+= Support/raw_ostream.cpp
	SRCS_MIN+= Support/regcomp.c
	SRCS_MIN+= Support/regerror.c
	SRCS_MIN+= Support/regexec.c
	SRCS_MIN+= Support/regfree.c
	SRCS_MIN+= Support/regstrlcpy.c
	SRCS_LLD+= Support/xxhash.cpp
	SRCS_MIN+= TableGen/Error.cpp
	SRCS_MIN+= TableGen/Main.cpp
	SRCS_MIN+= TableGen/Record.cpp
	SRCS_MIN+= TableGen/SetTheory.cpp
	SRCS_MIN+= TableGen/StringMatcher.cpp
	SRCS_MIN+= TableGen/TGLexer.cpp
	SRCS_MIN+= TableGen/TGParser.cpp
	SRCS_MIN+= TableGen/TableGenBackend.cpp
	SRCS_MIN+= Target/AArch64/AArch64A53Fix835769.cpp
	SRCS_MIN+= Target/AArch64/AArch64A57FPLoadBalancing.cpp
	SRCS_MIN+= Target/AArch64/AArch64AdvSIMDScalarPass.cpp
	SRCS_MIN+= Target/AArch64/AArch64AsmPrinter.cpp
	SRCS_MIN+= Target/AArch64/AArch64CallLowering.cpp
	SRCS_MIN+= Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
	SRCS_MIN+= Target/AArch64/AArch64CollectLOH.cpp
	SRCS_MIN+= Target/AArch64/AArch64CondBrTuning.cpp
	SRCS_MIN+= Target/AArch64/AArch64ConditionOptimizer.cpp
	SRCS_MIN+= Target/AArch64/AArch64ConditionalCompares.cpp
	SRCS_MIN+= Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
	SRCS_MIN+= Target/AArch64/AArch64ExpandPseudoInsts.cpp
	SRCS_MIN+= Target/AArch64/AArch64FalkorHWPFFix.cpp
	SRCS_MIN+= Target/AArch64/AArch64FastISel.cpp
	SRCS_MIN+= Target/AArch64/AArch64FrameLowering.cpp
	SRCS_MIN+= Target/AArch64/AArch64ISelDAGToDAG.cpp
	SRCS_MIN+= Target/AArch64/AArch64ISelLowering.cpp
	SRCS_MIN+= Target/AArch64/AArch64InstrInfo.cpp
	SRCS_MIN+= Target/AArch64/AArch64InstructionSelector.cpp
	SRCS_MIN+= Target/AArch64/AArch64LegalizerInfo.cpp
	SRCS_MIN+= Target/AArch64/AArch64LoadStoreOptimizer.cpp
	SRCS_MIN+= Target/AArch64/AArch64MCInstLower.cpp
	SRCS_MIN+= Target/AArch64/AArch64MacroFusion.cpp
	SRCS_MIN+= Target/AArch64/AArch64PBQPRegAlloc.cpp
	SRCS_MIN+= Target/AArch64/AArch64PromoteConstant.cpp
	SRCS_MIN+= Target/AArch64/AArch64RedundantCopyElimination.cpp
	SRCS_MIN+= Target/AArch64/AArch64RegisterBankInfo.cpp
	SRCS_MIN+= Target/AArch64/AArch64RegisterInfo.cpp
	SRCS_MIN+= Target/AArch64/AArch64SIMDInstrOpt.cpp
	SRCS_MIN+= Target/AArch64/AArch64SelectionDAGInfo.cpp
	SRCS_MIN+= Target/AArch64/AArch64StorePairSuppress.cpp
	SRCS_MIN+= Target/AArch64/AArch64Subtarget.cpp
	SRCS_MIN+= Target/AArch64/AArch64TargetMachine.cpp
	SRCS_MIN+= Target/AArch64/AArch64TargetObjectFile.cpp
	SRCS_MIN+= Target/AArch64/AArch64TargetTransformInfo.cpp
	SRCS_MIN+= Target/AArch64/AsmParser/AArch64AsmParser.cpp
	SRCS_XDW+= Target/AArch64/Disassembler/AArch64Disassembler.cpp
	SRCS_XDW+= Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
	SRCS_MIN+= Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
	SRCS_MIN+= Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
	SRCS_MIN+= Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
	SRCS_MIN+= Target/AArch64/Utils/AArch64BaseInfo.cpp
	SRCS_MIN+= Target/ARM/A15SDOptimizer.cpp
	SRCS_MIN+= Target/ARM/ARMAsmPrinter.cpp
	SRCS_MIN+= Target/ARM/ARMBaseInstrInfo.cpp
	SRCS_MIN+= Target/ARM/ARMBaseRegisterInfo.cpp
	SRCS_MIN+= Target/ARM/ARMCallLowering.cpp
	SRCS_MIN+= Target/ARM/ARMComputeBlockSize.cpp
	SRCS_MIN+= Target/ARM/ARMConstantIslandPass.cpp
	SRCS_MIN+= Target/ARM/ARMConstantPoolValue.cpp
	SRCS_MIN+= Target/ARM/ARMExpandPseudoInsts.cpp
	SRCS_MIN+= Target/ARM/ARMFastISel.cpp
	SRCS_MIN+= Target/ARM/ARMFrameLowering.cpp
	SRCS_MIN+= Target/ARM/ARMHazardRecognizer.cpp
	SRCS_MIN+= Target/ARM/ARMISelDAGToDAG.cpp
	SRCS_MIN+= Target/ARM/ARMISelLowering.cpp
	SRCS_MIN+= Target/ARM/ARMInstrInfo.cpp
	SRCS_MIN+= Target/ARM/ARMInstructionSelector.cpp
	SRCS_MIN+= Target/ARM/ARMLegalizerInfo.cpp
	SRCS_MIN+= Target/ARM/ARMLoadStoreOptimizer.cpp
	SRCS_MIN+= Target/ARM/ARMMCInstLower.cpp
	SRCS_MIN+= Target/ARM/ARMMachineFunctionInfo.cpp
	SRCS_MIN+= Target/ARM/ARMMacroFusion.cpp
	SRCS_MIN+= Target/ARM/ARMOptimizeBarriersPass.cpp
	SRCS_MIN+= Target/ARM/ARMRegisterBankInfo.cpp
	SRCS_MIN+= Target/ARM/ARMRegisterInfo.cpp
	SRCS_MIN+= Target/ARM/ARMSelectionDAGInfo.cpp
	SRCS_MIN+= Target/ARM/ARMSubtarget.cpp
	SRCS_MIN+= Target/ARM/ARMTargetMachine.cpp
	SRCS_MIN+= Target/ARM/ARMTargetObjectFile.cpp
	SRCS_MIN+= Target/ARM/ARMTargetTransformInfo.cpp
	SRCS_MIN+= Target/ARM/AsmParser/ARMAsmParser.cpp
	SRCS_MIN+= Target/ARM/Disassembler/ARMDisassembler.cpp
	SRCS_MIN+= Target/ARM/InstPrinter/ARMInstPrinter.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMCExpr.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
	SRCS_MIN+= Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
	SRCS_MIN+= Target/ARM/MLxExpansionPass.cpp
	SRCS_MIN+= Target/ARM/TargetInfo/ARMTargetInfo.cpp
	SRCS_MIN+= Target/ARM/Thumb1FrameLowering.cpp
	SRCS_MIN+= Target/ARM/Thumb1InstrInfo.cpp
	SRCS_MIN+= Target/ARM/Thumb2ITBlockPass.cpp
	SRCS_MIN+= Target/ARM/Thumb2InstrInfo.cpp
	SRCS_MIN+= Target/ARM/Thumb2SizeReduction.cpp
	SRCS_MIN+= Target/ARM/ThumbRegisterInfo.cpp
	SRCS_MIN+= Target/ARM/Utils/ARMBaseInfo.cpp
	SRCS_MIN+= Target/Mips/AsmParser/MipsAsmParser.cpp
	SRCS_XDW+= Target/Mips/Disassembler/MipsDisassembler.cpp
	SRCS_MIN+= Target/Mips/InstPrinter/MipsInstPrinter.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsABIInfo.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsMCExpr.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
	SRCS_MIN+= Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
	SRCS_MIN+= Target/Mips/MicroMipsSizeReduction.cpp
	SRCS_MIN+= Target/Mips/Mips16FrameLowering.cpp
	SRCS_MIN+= Target/Mips/Mips16HardFloat.cpp
	SRCS_MIN+= Target/Mips/Mips16HardFloatInfo.cpp
	SRCS_MIN+= Target/Mips/Mips16ISelDAGToDAG.cpp
	SRCS_MIN+= Target/Mips/Mips16ISelLowering.cpp
	SRCS_MIN+= Target/Mips/Mips16InstrInfo.cpp
	SRCS_MIN+= Target/Mips/Mips16RegisterInfo.cpp
	SRCS_MIN+= Target/Mips/MipsAnalyzeImmediate.cpp
	SRCS_MIN+= Target/Mips/MipsAsmPrinter.cpp
	SRCS_MIN+= Target/Mips/MipsCCState.cpp
	SRCS_MIN+= Target/Mips/MipsConstantIslandPass.cpp
	SRCS_MIN+= Target/Mips/MipsDelaySlotFiller.cpp
	SRCS_MIN+= Target/Mips/MipsFastISel.cpp
	SRCS_MIN+= Target/Mips/MipsFrameLowering.cpp
	SRCS_MIN+= Target/Mips/MipsHazardSchedule.cpp
	SRCS_MIN+= Target/Mips/MipsISelDAGToDAG.cpp
	SRCS_MIN+= Target/Mips/MipsISelLowering.cpp
	SRCS_MIN+= Target/Mips/MipsInstrInfo.cpp
	SRCS_MIN+= Target/Mips/MipsLongBranch.cpp
	SRCS_MIN+= Target/Mips/MipsMCInstLower.cpp
	SRCS_MIN+= Target/Mips/MipsMachineFunction.cpp
	SRCS_MIN+= Target/Mips/MipsModuleISelDAGToDAG.cpp
	SRCS_MIN+= Target/Mips/MipsOptimizePICCall.cpp
	SRCS_MIN+= Target/Mips/MipsOs16.cpp
	SRCS_MIN+= Target/Mips/MipsRegisterInfo.cpp
	SRCS_MIN+= Target/Mips/MipsSEFrameLowering.cpp
	SRCS_MIN+= Target/Mips/MipsSEISelDAGToDAG.cpp
	SRCS_MIN+= Target/Mips/MipsSEISelLowering.cpp
	SRCS_MIN+= Target/Mips/MipsSEInstrInfo.cpp
	SRCS_MIN+= Target/Mips/MipsSERegisterInfo.cpp
	SRCS_MIN+= Target/Mips/MipsSubtarget.cpp
	SRCS_MIN+= Target/Mips/MipsTargetMachine.cpp
	SRCS_MIN+= Target/Mips/MipsTargetObjectFile.cpp
	SRCS_MIN+= Target/Mips/TargetInfo/MipsTargetInfo.cpp
	SRCS_MIN+= Target/PowerPC/AsmParser/PPCAsmParser.cpp
	SRCS_MIN+= Target/PowerPC/Disassembler/PPCDisassembler.cpp
	SRCS_MIN+= Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
	SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
	SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
	SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
	SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
	SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
	SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
	SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
	SRCS_MIN+= Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
	SRCS_MIN+= Target/PowerPC/PPCAsmPrinter.cpp
	SRCS_MIN+= Target/PowerPC/PPCBoolRetToInt.cpp
	SRCS_MIN+= Target/PowerPC/PPCBranchCoalescing.cpp
	SRCS_MIN+= Target/PowerPC/PPCBranchSelector.cpp
	SRCS_MIN+= Target/PowerPC/PPCCCState.cpp
	SRCS_MIN+= Target/PowerPC/PPCCTRLoops.cpp
	SRCS_MIN+= Target/PowerPC/PPCEarlyReturn.cpp
	SRCS_MIN+= Target/PowerPC/PPCExpandISEL.cpp
	SRCS_MIN+= Target/PowerPC/PPCFastISel.cpp
	SRCS_MIN+= Target/PowerPC/PPCFrameLowering.cpp
	SRCS_MIN+= Target/PowerPC/PPCHazardRecognizers.cpp
	SRCS_MIN+= Target/PowerPC/PPCISelDAGToDAG.cpp
	SRCS_MIN+= Target/PowerPC/PPCISelLowering.cpp
	SRCS_MIN+= Target/PowerPC/PPCInstrInfo.cpp
	SRCS_MIN+= Target/PowerPC/PPCLoopPreIncPrep.cpp
	SRCS_MIN+= Target/PowerPC/PPCMCInstLower.cpp
	SRCS_MIN+= Target/PowerPC/PPCMIPeephole.cpp
	SRCS_MIN+= Target/PowerPC/PPCMachineFunctionInfo.cpp
	SRCS_MIN+= Target/PowerPC/PPCPreEmitPeephole.cpp
	SRCS_MIN+= Target/PowerPC/PPCQPXLoadSplat.cpp
	SRCS_MIN+= Target/PowerPC/PPCReduceCRLogicals.cpp
	SRCS_MIN+= Target/PowerPC/PPCRegisterInfo.cpp
	SRCS_MIN+= Target/PowerPC/PPCSubtarget.cpp
	SRCS_MIN+= Target/PowerPC/PPCTLSDynamicCall.cpp
	SRCS_MIN+= Target/PowerPC/PPCTOCRegDeps.cpp
	SRCS_MIN+= Target/PowerPC/PPCTargetMachine.cpp
	SRCS_MIN+= Target/PowerPC/PPCTargetObjectFile.cpp
	SRCS_MIN+= Target/PowerPC/PPCTargetTransformInfo.cpp
	SRCS_MIN+= Target/PowerPC/PPCVSXCopy.cpp
	SRCS_MIN+= Target/PowerPC/PPCVSXFMAMutate.cpp
	SRCS_MIN+= Target/PowerPC/PPCVSXSwapRemoval.cpp
	SRCS_MIN+= Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
	SRCS_MIN+= Target/Sparc/AsmParser/SparcAsmParser.cpp
	SRCS_MIN+= Target/Sparc/DelaySlotFiller.cpp
	SRCS_XDW+= Target/Sparc/Disassembler/SparcDisassembler.cpp
	SRCS_MIN+= Target/Sparc/InstPrinter/SparcInstPrinter.cpp
	SRCS_MIN+= Target/Sparc/LeonPasses.cpp
	SRCS_MIN+= Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
	SRCS_MIN+= Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
	SRCS_MIN+= Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
	SRCS_MIN+= Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
	SRCS_MIN+= Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
	SRCS_MIN+= Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
	SRCS_MIN+= Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
	SRCS_MIN+= Target/Sparc/SparcAsmPrinter.cpp
	SRCS_MIN+= Target/Sparc/SparcFrameLowering.cpp
	SRCS_MIN+= Target/Sparc/SparcISelDAGToDAG.cpp
	SRCS_MIN+= Target/Sparc/SparcISelLowering.cpp
	SRCS_MIN+= Target/Sparc/SparcInstrInfo.cpp
	SRCS_MIN+= Target/Sparc/SparcMCInstLower.cpp
	SRCS_MIN+= Target/Sparc/SparcMachineFunctionInfo.cpp
	SRCS_MIN+= Target/Sparc/SparcRegisterInfo.cpp
	SRCS_MIN+= Target/Sparc/SparcSubtarget.cpp
	SRCS_MIN+= Target/Sparc/SparcTargetMachine.cpp
	SRCS_MIN+= Target/Sparc/SparcTargetObjectFile.cpp
	SRCS_MIN+= Target/Sparc/TargetInfo/SparcTargetInfo.cpp
	SRCS_MIN+= Target/Target.cpp
	SRCS_MIN+= Target/TargetIntrinsicInfo.cpp
	SRCS_MIN+= Target/TargetLoweringObjectFile.cpp
	SRCS_MIN+= Target/TargetMachine.cpp
	SRCS_MIN+= Target/TargetMachineC.cpp
	SRCS_MIN+= Target/X86/AsmParser/X86AsmInstrumentation.cpp
	SRCS_MIN+= Target/X86/AsmParser/X86AsmParser.cpp
	SRCS_XDW+= Target/X86/Disassembler/X86Disassembler.cpp
	SRCS_XDW+= Target/X86/Disassembler/X86DisassemblerDecoder.cpp
	SRCS_MIN+= Target/X86/InstPrinter/X86ATTInstPrinter.cpp
	SRCS_MIN+= Target/X86/InstPrinter/X86InstComments.cpp
	SRCS_MIN+= Target/X86/InstPrinter/X86IntelInstPrinter.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86AsmBackend.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
	SRCS_MIN+= Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
	SRCS_MIN+= Target/X86/TargetInfo/X86TargetInfo.cpp
	SRCS_MIN+= Target/X86/Utils/X86ShuffleDecode.cpp
	SRCS_MIN+= Target/X86/X86AsmPrinter.cpp
	SRCS_MIN+= Target/X86/X86CallFrameOptimization.cpp
	SRCS_MIN+= Target/X86/X86CallLowering.cpp
	SRCS_MIN+= Target/X86/X86CallingConv.cpp
	SRCS_MIN+= Target/X86/X86CmovConversion.cpp
	SRCS_MIN+= Target/X86/X86DomainReassignment.cpp
	SRCS_MIN+= Target/X86/X86EvexToVex.cpp
	SRCS_MIN+= Target/X86/X86ExpandPseudo.cpp
	SRCS_MIN+= Target/X86/X86FastISel.cpp
	SRCS_MIN+= Target/X86/X86FixupBWInsts.cpp
	SRCS_MIN+= Target/X86/X86FixupLEAs.cpp
	SRCS_MIN+= Target/X86/X86FixupSetCC.cpp
	SRCS_MIN+= Target/X86/X86FloatingPoint.cpp
	SRCS_MIN+= Target/X86/X86FrameLowering.cpp
	SRCS_MIN+= Target/X86/X86ISelDAGToDAG.cpp
	SRCS_MIN+= Target/X86/X86ISelLowering.cpp
	SRCS_MIN+= Target/X86/X86InstrFMA3Info.cpp
	SRCS_MIN+= Target/X86/X86InstrInfo.cpp
	SRCS_MIN+= Target/X86/X86InstructionSelector.cpp
	SRCS_MIN+= Target/X86/X86InterleavedAccess.cpp
	SRCS_MIN+= Target/X86/X86LegalizerInfo.cpp
	SRCS_MIN+= Target/X86/X86MCInstLower.cpp
	SRCS_MIN+= Target/X86/X86MachineFunctionInfo.cpp
	SRCS_MIN+= Target/X86/X86MacroFusion.cpp
	SRCS_MIN+= Target/X86/X86OptimizeLEAs.cpp
	SRCS_MIN+= Target/X86/X86PadShortFunction.cpp
	SRCS_MIN+= Target/X86/X86RegisterBankInfo.cpp
	SRCS_MIN+= Target/X86/X86RegisterInfo.cpp
	+SRCS_MIN+= Target/X86/X86RetpolineThunks.cpp
	SRCS_MIN+= Target/X86/X86SelectionDAGInfo.cpp
	SRCS_MIN+= Target/X86/X86ShuffleDecodeConstantPool.cpp
	SRCS_MIN+= Target/X86/X86Subtarget.cpp
	SRCS_MIN+= Target/X86/X86TargetMachine.cpp
	SRCS_MIN+= Target/X86/X86TargetObjectFile.cpp
	SRCS_MIN+= Target/X86/X86TargetTransformInfo.cpp
	SRCS_MIN+= Target/X86/X86VZeroUpper.cpp
	SRCS_MIN+= Target/X86/X86WinAllocaExpander.cpp
	SRCS_MIN+= Target/X86/X86WinEHState.cpp
	SRCS_EXT+= ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
	SRCS_EXL+= ToolDrivers/llvm-lib/LibDriver.cpp
	SRCS_MIN+= Transforms/Coroutines/CoroCleanup.cpp
	SRCS_MIN+= Transforms/Coroutines/CoroEarly.cpp
	SRCS_MIN+= Transforms/Coroutines/CoroElide.cpp
	SRCS_MIN+= Transforms/Coroutines/CoroFrame.cpp
	SRCS_MIN+= Transforms/Coroutines/CoroSplit.cpp
	SRCS_MIN+= Transforms/Coroutines/Coroutines.cpp
	SRCS_MIN+= Transforms/IPO/AlwaysInliner.cpp
	SRCS_MIN+= Transforms/IPO/ArgumentPromotion.cpp
	SRCS_MIN+= Transforms/IPO/BarrierNoopPass.cpp
	SRCS_MIN+= Transforms/IPO/CalledValuePropagation.cpp
	SRCS_MIN+= Transforms/IPO/ConstantMerge.cpp
	SRCS_MIN+= Transforms/IPO/CrossDSOCFI.cpp
	SRCS_MIN+= Transforms/IPO/DeadArgumentElimination.cpp
	SRCS_MIN+= Transforms/IPO/ElimAvailExtern.cpp
	SRCS_MIN+= Transforms/IPO/ExtractGV.cpp
	SRCS_MIN+= Transforms/IPO/ForceFunctionAttrs.cpp
	SRCS_MIN+= Transforms/IPO/FunctionAttrs.cpp
	SRCS_MIN+= Transforms/IPO/FunctionImport.cpp
	SRCS_MIN+= Transforms/IPO/GlobalDCE.cpp
	SRCS_MIN+= Transforms/IPO/GlobalOpt.cpp
	SRCS_MIN+= Transforms/IPO/GlobalSplit.cpp
	SRCS_MIN+= Transforms/IPO/IPConstantPropagation.cpp
	SRCS_EXT+= Transforms/IPO/IPO.cpp
	SRCS_MIN+= Transforms/IPO/InferFunctionAttrs.cpp
	SRCS_MIN+= Transforms/IPO/InlineSimple.cpp
	SRCS_MIN+= Transforms/IPO/Inliner.cpp
	SRCS_MIN+= Transforms/IPO/Internalize.cpp
	SRCS_MIN+= Transforms/IPO/LoopExtractor.cpp
	SRCS_MIN+= Transforms/IPO/LowerTypeTests.cpp
	SRCS_MIN+= Transforms/IPO/MergeFunctions.cpp
	SRCS_MIN+= Transforms/IPO/PartialInlining.cpp
	SRCS_MIN+= Transforms/IPO/PassManagerBuilder.cpp
	SRCS_MIN+= Transforms/IPO/PruneEH.cpp
	SRCS_MIN+= Transforms/IPO/SampleProfile.cpp
	SRCS_MIN+= Transforms/IPO/StripDeadPrototypes.cpp
	SRCS_MIN+= Transforms/IPO/StripSymbols.cpp
	SRCS_MIN+= Transforms/IPO/ThinLTOBitcodeWriter.cpp
	SRCS_MIN+= Transforms/IPO/WholeProgramDevirt.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineAddSub.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineAndOrXor.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineCalls.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineCasts.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineCompares.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineMulDivRem.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombinePHI.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineSelect.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineShifts.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
	SRCS_MIN+= Transforms/InstCombine/InstCombineVectorOps.cpp
	SRCS_MIN+= Transforms/InstCombine/InstructionCombining.cpp
	SRCS_MIN+= Transforms/Instrumentation/AddressSanitizer.cpp
	SRCS_MIN+= Transforms/Instrumentation/BoundsChecking.cpp
	SRCS_MIN+= Transforms/Instrumentation/DataFlowSanitizer.cpp
	SRCS_MIN+= Transforms/Instrumentation/EfficiencySanitizer.cpp
	SRCS_MIN+= Transforms/Instrumentation/GCOVProfiling.cpp
	SRCS_MIN+= Transforms/Instrumentation/HWAddressSanitizer.cpp
	SRCS_MIN+= Transforms/Instrumentation/IndirectCallPromotion.cpp
	SRCS_MIN+= Transforms/Instrumentation/InstrProfiling.cpp
	SRCS_MIN+= Transforms/Instrumentation/Instrumentation.cpp
	SRCS_MIN+= Transforms/Instrumentation/MemorySanitizer.cpp
	SRCS_MIN+= Transforms/Instrumentation/PGOInstrumentation.cpp
	SRCS_MIN+= Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
	SRCS_MIN+= Transforms/Instrumentation/SanitizerCoverage.cpp
	SRCS_MIN+= Transforms/Instrumentation/ThreadSanitizer.cpp
	SRCS_MIN+= Transforms/ObjCARC/DependencyAnalysis.cpp
	SRCS_EXT+= Transforms/ObjCARC/ObjCARC.cpp
	SRCS_MIN+= Transforms/ObjCARC/ObjCARCAPElim.cpp
	SRCS_MIN+= Transforms/ObjCARC/ObjCARCContract.cpp
	SRCS_MIN+= Transforms/ObjCARC/ObjCARCExpand.cpp
	SRCS_MIN+= Transforms/ObjCARC/ObjCARCOpts.cpp
	SRCS_MIN+= Transforms/ObjCARC/ProvenanceAnalysis.cpp
	SRCS_MIN+= Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
	SRCS_MIN+= Transforms/ObjCARC/PtrState.cpp
	SRCS_MIN+= Transforms/Scalar/ADCE.cpp
	SRCS_MIN+= Transforms/Scalar/AlignmentFromAssumptions.cpp
	SRCS_MIN+= Transforms/Scalar/BDCE.cpp
	SRCS_MIN+= Transforms/Scalar/CallSiteSplitting.cpp
	SRCS_MIN+= Transforms/Scalar/ConstantHoisting.cpp
	SRCS_MIN+= Transforms/Scalar/ConstantProp.cpp
	SRCS_MIN+= Transforms/Scalar/CorrelatedValuePropagation.cpp
	SRCS_MIN+= Transforms/Scalar/DCE.cpp
	SRCS_MIN+= Transforms/Scalar/DeadStoreElimination.cpp
	SRCS_MIN+= Transforms/Scalar/DivRemPairs.cpp
	SRCS_MIN+= Transforms/Scalar/EarlyCSE.cpp
	SRCS_MIN+= Transforms/Scalar/FlattenCFGPass.cpp
	SRCS_MIN+= Transforms/Scalar/Float2Int.cpp
	SRCS_MIN+= Transforms/Scalar/GVN.cpp
	SRCS_MIN+= Transforms/Scalar/GVNHoist.cpp
	SRCS_MIN+= Transforms/Scalar/GVNSink.cpp
	SRCS_MIN+= Transforms/Scalar/GuardWidening.cpp
	SRCS_MIN+= Transforms/Scalar/IVUsersPrinter.cpp
	SRCS_MIN+= Transforms/Scalar/IndVarSimplify.cpp
	SRCS_MIN+= Transforms/Scalar/InductiveRangeCheckElimination.cpp
	SRCS_EXT+= Transforms/Scalar/InferAddressSpaces.cpp
	SRCS_MIN+= Transforms/Scalar/JumpThreading.cpp
	SRCS_MIN+= Transforms/Scalar/LICM.cpp
	SRCS_MIN+= Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
	SRCS_MIN+= Transforms/Scalar/LoopDataPrefetch.cpp
	SRCS_MIN+= Transforms/Scalar/LoopDeletion.cpp
	SRCS_MIN+= Transforms/Scalar/LoopDistribute.cpp
	SRCS_MIN+= Transforms/Scalar/LoopIdiomRecognize.cpp
	SRCS_MIN+= Transforms/Scalar/LoopInstSimplify.cpp
	SRCS_MIN+= Transforms/Scalar/LoopInterchange.cpp
	SRCS_MIN+= Transforms/Scalar/LoopLoadElimination.cpp
	SRCS_MIN+= Transforms/Scalar/LoopPassManager.cpp
	SRCS_MIN+= Transforms/Scalar/LoopPredication.cpp
	SRCS_MIN+= Transforms/Scalar/LoopRerollPass.cpp
	SRCS_MIN+= Transforms/Scalar/LoopRotation.cpp
	SRCS_MIN+= Transforms/Scalar/LoopSimplifyCFG.cpp
	SRCS_MIN+= Transforms/Scalar/LoopSink.cpp
	SRCS_MIN+= Transforms/Scalar/LoopStrengthReduce.cpp
	SRCS_MIN+= Transforms/Scalar/LoopUnrollPass.cpp
	SRCS_MIN+= Transforms/Scalar/LoopUnswitch.cpp
	SRCS_MIN+= Transforms/Scalar/LoopVersioningLICM.cpp
	SRCS_MIN+= Transforms/Scalar/LowerAtomic.cpp
	SRCS_MIN+= Transforms/Scalar/LowerExpectIntrinsic.cpp
	SRCS_MIN+= Transforms/Scalar/LowerGuardIntrinsic.cpp
	SRCS_MIN+= Transforms/Scalar/MemCpyOptimizer.cpp
	SRCS_MIN+= Transforms/Scalar/MergeICmps.cpp
	SRCS_MIN+= Transforms/Scalar/MergedLoadStoreMotion.cpp
	SRCS_MIN+= Transforms/Scalar/NaryReassociate.cpp
	SRCS_MIN+= Transforms/Scalar/NewGVN.cpp
	SRCS_MIN+= Transforms/Scalar/PartiallyInlineLibCalls.cpp
	SRCS_MIN+= Transforms/Scalar/PlaceSafepoints.cpp
	SRCS_MIN+= Transforms/Scalar/Reassociate.cpp
	SRCS_MIN+= Transforms/Scalar/Reg2Mem.cpp
	SRCS_MIN+= Transforms/Scalar/RewriteStatepointsForGC.cpp
	SRCS_MIN+= Transforms/Scalar/SCCP.cpp
	SRCS_MIN+= Transforms/Scalar/SROA.cpp
	SRCS_EXT+= Transforms/Scalar/Scalar.cpp
	SRCS_MIN+= Transforms/Scalar/Scalarizer.cpp
	SRCS_MIN+= Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
	SRCS_MIN+= Transforms/Scalar/SimpleLoopUnswitch.cpp
	SRCS_MIN+= Transforms/Scalar/SimplifyCFGPass.cpp
	SRCS_MIN+= Transforms/Scalar/Sink.cpp
	SRCS_MIN+= Transforms/Scalar/SpeculateAroundPHIs.cpp
	SRCS_MIN+= Transforms/Scalar/SpeculativeExecution.cpp
	SRCS_MIN+= Transforms/Scalar/StraightLineStrengthReduce.cpp
	SRCS_MIN+= Transforms/Scalar/StructurizeCFG.cpp
	SRCS_MIN+= Transforms/Scalar/TailRecursionElimination.cpp
	SRCS_MIN+= Transforms/Utils/ASanStackFrameLayout.cpp
	SRCS_MIN+= Transforms/Utils/AddDiscriminators.cpp
	SRCS_MIN+= Transforms/Utils/BasicBlockUtils.cpp
	SRCS_MIN+= Transforms/Utils/BreakCriticalEdges.cpp
	SRCS_MIN+= Transforms/Utils/BuildLibCalls.cpp
	SRCS_MIN+= Transforms/Utils/BypassSlowDivision.cpp
	SRCS_MIN+= Transforms/Utils/CallPromotionUtils.cpp
	SRCS_MIN+= Transforms/Utils/CloneFunction.cpp
	SRCS_MIN+= Transforms/Utils/CloneModule.cpp
	SRCS_MIN+= Transforms/Utils/CodeExtractor.cpp
	SRCS_MIN+= Transforms/Utils/CtorUtils.cpp
	SRCS_MIN+= Transforms/Utils/DemoteRegToStack.cpp
	SRCS_MIN+= Transforms/Utils/EntryExitInstrumenter.cpp
	SRCS_MIN+= Transforms/Utils/EscapeEnumerator.cpp
	SRCS_MIN+= Transforms/Utils/Evaluator.cpp
	SRCS_MIN+= Transforms/Utils/FlattenCFG.cpp
	SRCS_MIN+= Transforms/Utils/FunctionComparator.cpp
	SRCS_MIN+= Transforms/Utils/FunctionImportUtils.cpp
	SRCS_MIN+= Transforms/Utils/GlobalStatus.cpp
	SRCS_MIN+= Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
	SRCS_MIN+= Transforms/Utils/InlineFunction.cpp
	SRCS_MIN+= Transforms/Utils/InstructionNamer.cpp
	SRCS_MIN+= Transforms/Utils/IntegerDivision.cpp
	SRCS_MIN+= Transforms/Utils/LCSSA.cpp
	SRCS_MIN+= Transforms/Utils/LibCallsShrinkWrap.cpp
	SRCS_MIN+= Transforms/Utils/Local.cpp
	SRCS_MIN+= Transforms/Utils/LoopSimplify.cpp
	SRCS_MIN+= Transforms/Utils/LoopUnroll.cpp
	SRCS_MIN+= Transforms/Utils/LoopUnrollPeel.cpp
	SRCS_MIN+= Transforms/Utils/LoopUnrollRuntime.cpp
	SRCS_MIN+= Transforms/Utils/LoopUtils.cpp
	SRCS_MIN+= Transforms/Utils/LoopVersioning.cpp
	SRCS_MIN+= Transforms/Utils/LowerInvoke.cpp
	SRCS_MIN+= Transforms/Utils/LowerSwitch.cpp
	SRCS_MIN+= Transforms/Utils/Mem2Reg.cpp
	SRCS_MIN+= Transforms/Utils/MetaRenamer.cpp
	SRCS_MIN+= Transforms/Utils/ModuleUtils.cpp
	SRCS_MIN+= Transforms/Utils/NameAnonGlobals.cpp
	SRCS_MIN+= Transforms/Utils/OrderedInstructions.cpp
	SRCS_MIN+= Transforms/Utils/PredicateInfo.cpp
	SRCS_MIN+= Transforms/Utils/PromoteMemoryToRegister.cpp
	SRCS_MIN+= Transforms/Utils/SSAUpdater.cpp
	SRCS_MIN+= Transforms/Utils/SanitizerStats.cpp
	SRCS_MIN+= Transforms/Utils/SimplifyCFG.cpp
	SRCS_MIN+= Transforms/Utils/SimplifyIndVar.cpp
	SRCS_MIN+= Transforms/Utils/SimplifyInstructions.cpp
	SRCS_MIN+= Transforms/Utils/SimplifyLibCalls.cpp
	SRCS_MIN+= Transforms/Utils/SplitModule.cpp
	SRCS_MIN+= Transforms/Utils/StripGCRelocates.cpp
	SRCS_MIN+= Transforms/Utils/StripNonLineTableDebugInfo.cpp
	SRCS_MIN+= Transforms/Utils/SymbolRewriter.cpp
	SRCS_MIN+= Transforms/Utils/UnifyFunctionExitNodes.cpp
	SRCS_EXT+= Transforms/Utils/Utils.cpp
	SRCS_MIN+= Transforms/Utils/VNCoercion.cpp
	SRCS_MIN+= Transforms/Utils/ValueMapper.cpp
	SRCS_MIN+= Transforms/Vectorize/LoadStoreVectorizer.cpp
	SRCS_MIN+= Transforms/Vectorize/LoopVectorize.cpp
	SRCS_MIN+= Transforms/Vectorize/SLPVectorizer.cpp
	SRCS_MIN+= Transforms/Vectorize/VPlan.cpp
	SRCS_EXT+= Transforms/Vectorize/Vectorize.cpp
	SRCS_EXT+= XRay/InstrumentationMap.cpp

	# Assemble the final source list from the per-category SRCS_* lists.
	# SRCS_MIN is always included; each other category is appended only when
	# the condition guarding it holds.
	SRCS_ALL+= ${SRCS_MIN}
	# SRCS_MIW: not building with TOOLS_PREFIX, or lld is installed as ld.
	.if !defined(TOOLS_PREFIX) || ${MK_LLD_IS_LD} != "no"
	SRCS_ALL+= ${SRCS_MIW}
	.endif
	# SRCS_EXT: extra clang tools enabled.
	.if ${MK_CLANG_EXTRAS} != "no"
	SRCS_ALL+= ${SRCS_EXT}
	.endif
	# SRCS_FUL: full clang enabled.
	.if ${MK_CLANG_FULL} != "no"
	SRCS_ALL+= ${SRCS_FUL}
	.endif
	# SRCS_EXL: extra clang tools or lld enabled.
	.if ${MK_CLANG_EXTRAS} != "no" || ${MK_LLD} != "no"
	SRCS_ALL+= ${SRCS_EXL}
	.endif
	# SRCS_LLD: lld enabled.
	.if ${MK_LLD} != "no"
	SRCS_ALL+= ${SRCS_LLD}
	.endif
	# SRCS_XDB: extra clang tools or lldb enabled.
	.if ${MK_CLANG_EXTRAS} != "no" || ${MK_LLDB} != "no"
	SRCS_ALL+= ${SRCS_XDB}
	.endif
	# SRCS_XDL: extra clang tools, lldb or lld enabled.
	.if ${MK_CLANG_EXTRAS} != "no" || ${MK_LLDB} != "no" || ${MK_LLD} != "no"
	SRCS_ALL+= ${SRCS_XDL}
	.endif
	# SRCS_XDW: extra clang tools or lldb enabled, or not building with
	# TOOLS_PREFIX.
	.if ${MK_CLANG_EXTRAS} != "no" || ${MK_LLDB} != "no" || !defined(TOOLS_PREFIX)
	SRCS_ALL+= ${SRCS_XDW}
	.endif
	# :O sorts the words, so SRCS ends up in alphabetical order no matter
	# in which order the categories were appended above.
	SRCS+= ${SRCS_ALL:O}

	# Generate llvm/IR/Attributes.gen from Attributes.td using tblgen's
	# -gen-attrs backend.  The -d option names the dependency file written
	# alongside the target; the output is recorded in TGHDRS.
	llvm/IR/Attributes.gen: ${LLVM_SRCS}/include/llvm/IR/Attributes.td
	${LLVM_TBLGEN} -gen-attrs \
	-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
	${LLVM_SRCS}/include/llvm/IR/Attributes.td
	TGHDRS+= llvm/IR/Attributes.gen

	# Generate llvm/IR/Intrinsics.gen from Intrinsics.td using tblgen's
	# -gen-intrinsic backend, and record it in TGHDRS.
	llvm/IR/Intrinsics.gen: ${LLVM_SRCS}/include/llvm/IR/Intrinsics.td
	${LLVM_TBLGEN} -gen-intrinsic \
	-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
	${LLVM_SRCS}/include/llvm/IR/Intrinsics.td
	TGHDRS+= llvm/IR/Intrinsics.gen

	# Generate AttributesCompatFunc.inc from lib/IR/AttributesCompatFunc.td.
	# This uses the same -gen-attrs backend as Attributes.gen, but a
	# different input file; the output is recorded in TGHDRS.
	AttributesCompatFunc.inc: ${LLVM_SRCS}/lib/IR/AttributesCompatFunc.td
	${LLVM_TBLGEN} -gen-attrs \
	-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
	${LLVM_SRCS}/lib/IR/AttributesCompatFunc.td
	TGHDRS+= AttributesCompatFunc.inc

	# Generate the option-parser definitions for the llvm-lib tool driver.
	# The output goes into an llvm-lib/ subdirectory, and that subdirectory
	# is added to the include path for LibDriver.cpp only -- presumably so
	# its Options.inc is not confused with the llvm-dlltool one.
	llvm-lib/Options.inc: ${LLVM_SRCS}/lib/ToolDrivers/llvm-lib/Options.td
	${LLVM_TBLGEN} -gen-opt-parser-defs \
	-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
	${LLVM_SRCS}/lib/ToolDrivers/llvm-lib/Options.td
	TGHDRS+= llvm-lib/Options.inc
	CFLAGS.LibDriver.cpp+= -I${.OBJDIR}/llvm-lib

	# Generate the option-parser definitions for the llvm-dlltool tool
	# driver.  The output goes into an llvm-dlltool/ subdirectory, and that
	# subdirectory is added to the include path for DlltoolDriver.cpp only
	# -- presumably so its Options.inc is not confused with the llvm-lib one.
	llvm-dlltool/Options.inc: ${LLVM_SRCS}/lib/ToolDrivers/llvm-dlltool/Options.td
	${LLVM_TBLGEN} -gen-opt-parser-defs \
	-I ${LLVM_SRCS}/include -d ${.TARGET}.d -o ${.TARGET} \
	${LLVM_SRCS}/lib/ToolDrivers/llvm-dlltool/Options.td
	TGHDRS+= llvm-dlltool/Options.inc
	CFLAGS.DlltoolDriver.cpp+= -I${.OBJDIR}/llvm-dlltool

	beforebuild:
	# 20170724 remove stale Options.inc file, of which there are two different
	# versions after upstream r308421, one for llvm-lib, one for llvm-dlltool
	#
	# The cleanup commands are only emitted when the un-prefixed file or its
	# .d dependency file actually exists; otherwise this target does nothing.
	.for f in Options.inc
	.if exists(${f}) || exists(${f}.d)
	@echo Removing stale generated ${f} files
	@rm -f ${f} ${f}.d
	.endif
	.endfor

	# Define one tblgen rule per (target, generated header) pair.
	#
	# Each "arch" word is <target directory>/<.td base name>: ${arch:H} is
	# the directory under lib/Target and ${arch:T} is both the name of the
	# top-level .td file and the prefix of the generated header (e.g.
	# PowerPC/PPC yields PPCGen*.inc from lib/Target/PowerPC/PPC.td).
	#
	# Each "hdr" word is <header suffix>/<tblgen options>: ${hdr:H} is the
	# suffix of the generated header and ${hdr:T} is the option list, in
	# which commas stand in for spaces and are expanded by :C/,/ /g (e.g.
	# AsmWriter1 is built with "-gen-asm-writer -asmwriternum=1").
	#
	# Note: some rules are superfluous, not every combination is valid.
	.for arch in \
	AArch64/AArch64 ARM/ARM Mips/Mips PowerPC/PPC Sparc/Sparc X86/X86
	. for hdr in \
	AsmMatcher/-gen-asm-matcher \
	AsmWriter1/-gen-asm-writer,-asmwriternum=1 \
	AsmWriter/-gen-asm-writer \
	CallingConv/-gen-callingconv \
	CodeEmitter/-gen-emitter \
	DAGISel/-gen-dag-isel \
	DisassemblerTables/-gen-disassembler \
	EVEX2VEXTables/-gen-x86-EVEX2VEX-tables \
	FastISel/-gen-fast-isel \
	GlobalISel/-gen-global-isel \
	InstrInfo/-gen-instr-info \
	MCCodeEmitter/-gen-emitter \
	MCPseudoLowering/-gen-pseudo-lowering \
	RegisterBank/-gen-register-bank \
	RegisterInfo/-gen-register-info \
	SubtargetInfo/-gen-subtarget \
	SystemOperands/-gen-searchable-tables \
	SystemRegister/-gen-searchable-tables
	${arch:T}Gen${hdr:H}.inc: ${LLVM_SRCS}/lib/Target/${arch:H}/${arch:T}.td
	${LLVM_TBLGEN} ${hdr:T:C/,/ /g} \
	-I ${LLVM_SRCS}/include -I ${LLVM_SRCS}/lib/Target/${arch:H} \
	-d ${.TARGET}.d -o ${.TARGET} \
	${LLVM_SRCS}/lib/Target/${arch:H}/${arch:T}.td
	. endfor
	.endfor
	# Per-target generated headers that are actually built.  The loop above
	# defines a rule for every target/header combination; only the ones
	# listed here are added to TGHDRS.
	TGHDRS+= AArch64GenAsmMatcher.inc
	TGHDRS+= AArch64GenAsmWriter.inc
	TGHDRS+= AArch64GenAsmWriter1.inc
	TGHDRS+= AArch64GenCallingConv.inc
	TGHDRS+= AArch64GenDAGISel.inc
	TGHDRS+= AArch64GenDisassemblerTables.inc
	TGHDRS+= AArch64GenFastISel.inc
	TGHDRS+= AArch64GenGlobalISel.inc
	TGHDRS+= AArch64GenInstrInfo.inc
	TGHDRS+= AArch64GenMCCodeEmitter.inc
	TGHDRS+= AArch64GenMCPseudoLowering.inc
	TGHDRS+= AArch64GenRegisterBank.inc
	TGHDRS+= AArch64GenRegisterInfo.inc
	TGHDRS+= AArch64GenSubtargetInfo.inc
	TGHDRS+= AArch64GenSystemOperands.inc
	TGHDRS+= ARMGenAsmMatcher.inc
	TGHDRS+= ARMGenAsmWriter.inc
	TGHDRS+= ARMGenCallingConv.inc
	TGHDRS+= ARMGenDAGISel.inc
	TGHDRS+= ARMGenDisassemblerTables.inc
	TGHDRS+= ARMGenFastISel.inc
	TGHDRS+= ARMGenGlobalISel.inc
	TGHDRS+= ARMGenInstrInfo.inc
	TGHDRS+= ARMGenMCCodeEmitter.inc
	TGHDRS+= ARMGenMCPseudoLowering.inc
	TGHDRS+= ARMGenRegisterBank.inc
	TGHDRS+= ARMGenRegisterInfo.inc
	TGHDRS+= ARMGenSubtargetInfo.inc
	TGHDRS+= ARMGenSystemRegister.inc
	TGHDRS+= MipsGenAsmMatcher.inc
	TGHDRS+= MipsGenAsmWriter.inc
	TGHDRS+= MipsGenCallingConv.inc
	TGHDRS+= MipsGenDAGISel.inc
	TGHDRS+= MipsGenDisassemblerTables.inc
	TGHDRS+= MipsGenFastISel.inc
	TGHDRS+= MipsGenInstrInfo.inc
	TGHDRS+= MipsGenMCCodeEmitter.inc
	TGHDRS+= MipsGenMCPseudoLowering.inc
	TGHDRS+= MipsGenRegisterInfo.inc
	TGHDRS+= MipsGenSubtargetInfo.inc
	TGHDRS+= PPCGenAsmMatcher.inc
	TGHDRS+= PPCGenAsmWriter.inc
	TGHDRS+= PPCGenCallingConv.inc
	TGHDRS+= PPCGenDAGISel.inc
	TGHDRS+= PPCGenDisassemblerTables.inc
	TGHDRS+= PPCGenFastISel.inc
	TGHDRS+= PPCGenInstrInfo.inc
	TGHDRS+= PPCGenMCCodeEmitter.inc
	TGHDRS+= PPCGenRegisterInfo.inc
	TGHDRS+= PPCGenSubtargetInfo.inc
	TGHDRS+= SparcGenAsmMatcher.inc
	TGHDRS+= SparcGenAsmWriter.inc
	TGHDRS+= SparcGenCallingConv.inc
	TGHDRS+= SparcGenDAGISel.inc
	TGHDRS+= SparcGenDisassemblerTables.inc
	TGHDRS+= SparcGenInstrInfo.inc
	TGHDRS+= SparcGenMCCodeEmitter.inc
	TGHDRS+= SparcGenRegisterInfo.inc
	TGHDRS+= SparcGenSubtargetInfo.inc
	TGHDRS+= X86GenAsmMatcher.inc
	TGHDRS+= X86GenAsmWriter.inc
	TGHDRS+= X86GenAsmWriter1.inc
	TGHDRS+= X86GenCallingConv.inc
	TGHDRS+= X86GenDAGISel.inc
	TGHDRS+= X86GenDisassemblerTables.inc
	TGHDRS+= X86GenEVEX2VEXTables.inc
	TGHDRS+= X86GenFastISel.inc
	TGHDRS+= X86GenGlobalISel.inc
	TGHDRS+= X86GenInstrInfo.inc
	TGHDRS+= X86GenRegisterBank.inc
	TGHDRS+= X86GenRegisterInfo.inc
	TGHDRS+= X86GenSubtargetInfo.inc

	# Register every generated header with the build: ${TGHDRS:C/$/.d/}
	# appends ".d" to each name, i.e. the dependency file tblgen wrote via
	# its -d option.  The headers themselves are added to DPSRCS, and both
	# the headers and their .d files are added to CLEANFILES.
	DEPENDFILES+= ${TGHDRS:C/$/.d/}
	DPSRCS+= ${TGHDRS}
	CLEANFILES+= ${TGHDRS} ${TGHDRS:C/$/.d/}

	# Pull in the LLVM build settings file from the parent directory, then
	# the system bsd.lib.mk (presumably supplying the library build rules
	# that consume SRCS -- confirm against bsd.lib.mk).
	.include "../llvm.build.mk"
	.include <bsd.lib.mk>

File Metadata

Mime Type: text/x-c
Expires: Sun, May 3, 12:45 AM (2 d)
Storage Engine: local-disk
Storage Format: Raw Data
Storage Handle: 21/03/fc2ad90eaaf0ef1f51567f31a3ba
Default Alt Text: (3 MB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions